flappa-doormal 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +7 -0
- package/README.md +357 -0
- package/dist/index.d.mts +460 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +517 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +50 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
import { makeDiacriticInsensitive } from "bitaboom";
|
|
2
|
+
|
|
3
|
+
//#region src/markers/defaults.ts
|
|
4
|
+
/**
 * Numbering style applied when a marker configuration does not name one.
 */
const DEFAULT_NUMBERING = "arabic-indic";
/**
 * Separator style applied when a marker configuration does not name one.
 */
const DEFAULT_SEPARATOR = "dash";
/**
 * Regex character class for the default (dash) separator: hyphen-minus,
 * en dash, em dash and the Arabic tatweel.
 */
const DEFAULT_SEPARATOR_PATTERN = "[-–—ـ]";
/**
 * Regex source for the number part of a marker, keyed by numbering style.
 */
const NUMBERING_PATTERNS = {
    "arabic-indic": "[\\u0660-\\u0669]+",
    latin: "\\d+"
};
/**
 * Regex source for the separator part of a marker, keyed by separator style.
 * The "none" style maps to the empty string (no separator at all).
 */
const SEPARATOR_PATTERNS = {
    colon: ":",
    dash: DEFAULT_SEPARATOR_PATTERN,
    dot: "\\.",
    none: "",
    paren: "\\)"
};
|
|
33
|
+
|
|
34
|
+
//#endregion
|
|
35
|
+
//#region src/markers/presets.ts
|
|
36
|
+
/**
 * Default phrase lists for the preset marker types.
 * Both lists are exported so callers can build extended copies of them.
 */
/**
 * Phrases that open a hadith chain of narration.
 * generateHadithChainRegex passes every entry through makeDiacriticInsensitive
 * and joins the results into one alternation, so the list is used whenever no
 * custom `phrases` are supplied.
 * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
 */
const DEFAULT_HADITH_PHRASES = [
    "حَدَّثَنَا",
    "حدثنا",
    "أَخْبَرَنَا",
    "حدثني",
    "حدَّثني",
    "وحدثنا",
    "حُدِّثت عن",
    "وحَدَّثَنَا"
];
/**
 * Basmala openers used by generateBasmalaRegex.
 * Entries are regex source (note the escaped `[` in the bracketed forms), not
 * plain text.
 * Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
 */
const DEFAULT_BASMALA_PATTERNS = [
    "بسم الله",
    "\\[بسم",
    "\\[تم"
];
|
|
63
|
+
|
|
64
|
+
//#endregion
|
|
65
|
+
//#region src/markers/tokens.ts
|
|
66
|
+
/**
 * Token definitions for pattern templates.
 * Tokens provide a readable alternative to raw regex patterns.
 */
/**
 * Standard tokens for building marker patterns. Each value is regex source
 * that expandTemplate substitutes for the matching `{name}` placeholder.
 * Use these in templates like: '{num} {dash}' instead of '[\\u0660-\\u0669]+ [-–—ـ]'
 */
const TOKENS = {
    // One bullet-like character: bullet, asterisk or degree sign.
    bullet: "[•*°]",
    colon: ":",
    // Arabic comma.
    comma: "،",
    // Capturing group over the rest of the line (`.` does not cross newlines).
    content: "(.*)",
    // Hyphen-minus, en dash, em dash or tatweel.
    dash: "[-–—ـ]",
    dot: "\\.",
    // One or more digits as matched by \d.
    latin: "\\d+",
    // A single character in the range U+0623 (أ) to U+064A (ي).
    letter: "[أ-ي]",
    // One or more Arabic-Indic digits (U+0660 to U+0669).
    num: "[\\u0660-\\u0669]+",
    // A literal closing parenthesis.
    paren: "\\)",
    // Optional single whitespace character.
    s: "\\s?",
    slash: "/",
    // One or more whitespace characters.
    space: "\\s+"
};
|
|
89
|
+
|
|
90
|
+
//#endregion
|
|
91
|
+
//#region src/markers/template-parser.ts
|
|
92
|
+
/**
 * Expands a template string into an anchored regex pattern with named capture groups.
 *
 * Every `{token}` placeholder that has an entry in the token map is replaced by
 * that token's regex source. All other template text, including literal spaces,
 * is copied through unchanged and therefore becomes part of the regex source.
 * Placeholders with no matching token are left unexpanded.
 *
 * The result always has three groups: full (entire match), marker (the expanded
 * template) and content (everything after the marker). The content group is the
 * greedy `[\s\S]*`, so it also spans newlines.
 *
 * @param template - Template string with {token} placeholders
 * @param options - Optional configuration
 * @param options.tokens - Token map to use instead of the built-in TOKENS
 * @returns Regex pattern string with named groups
 *
 * @example
 * expandTemplate('{num} {dash}')
 * // Returns: ^(?<full>(?<marker>[\u0660-\u0669]+ [-–—ـ])(?<content>[\s\S]*))
 */
function expandTemplate(template, options) {
    const tokenMap = options?.tokens || TOKENS;
    let expandedMarker = template;
    for (const [token, pattern] of Object.entries(tokenMap)) {
        // A function replacer inserts the pattern verbatim. A plain string
        // replacement would interpret `$$`, `$&`, "$`" and `$'` as substitution
        // patterns and silently corrupt token regexes that contain `$`.
        expandedMarker = expandedMarker.replaceAll(`{${token}}`, () => pattern);
    }
    return String.raw`^(?<full>(?<marker>${expandedMarker})(?<content>[\s\S]*))`;
}
|
|
115
|
+
/**
 * Builds a token map made of the built-in TOKENS plus caller-supplied tokens.
 * A custom token with the same name as a built-in one replaces it; the
 * built-in TOKENS object itself is not modified.
 *
 * @param customTokens - Custom token definitions (name -> regex source)
 * @returns A new object containing the built-in and the custom tokens
 *
 * @example
 * const myTokens = createTokenMap({
 *   verse: '\\[[\\u0660-\\u0669]+\\]',
 *   tafsir: 'تفسير'
 * });
 */
function createTokenMap(customTokens) {
    const combined = { ...TOKENS, ...customTokens };
    return combined;
}
|
|
133
|
+
/**
 * Validates that every `{token}` placeholder in a template is defined.
 *
 * Only placeholders made of word characters (`\w+`) are checked. A token counts
 * as defined when it is an own property of the token map, which matches what
 * expandTemplate can actually expand.
 *
 * @param template - Template to validate
 * @param tokens - Token map to validate against (defaults to the built-in TOKENS)
 * @returns `{ valid: true }`, or `{ valid: false, errors }` where `errors` lists
 *   the unknown tokens followed by the available ones
 *
 * @example
 * validateTemplate('{num} {dash}')
 * // Returns: { valid: true }
 *
 * validateTemplate('{invalid}')
 * // Returns: { valid: false, errors: ['Unknown tokens: {invalid}', 'Available tokens: {bullet}, ...'] }
 */
function validateTemplate(template, tokens = TOKENS) {
    // Own-property check: a truthiness test would accept inherited names such
    // as {constructor} and reject tokens deliberately defined as "".
    const isDefined = (name) => Object.prototype.hasOwnProperty.call(tokens, name);
    const placeholders = template.match(/\{(\w+)\}/g) || [];
    const unknownTokens = placeholders.map((t) => t.slice(1, -1)).filter((name) => !isDefined(name));
    if (unknownTokens.length > 0) {
        const asPlaceholders = (names) => names.map((t) => `{${t}}`).join(", ");
        return {
            valid: false,
            errors: [`Unknown tokens: ${asPlaceholders(unknownTokens)}`, `Available tokens: ${asPlaceholders(Object.keys(tokens))}`]
        };
    }
    return { valid: true };
}
|
|
155
|
+
|
|
156
|
+
//#endregion
|
|
157
|
+
//#region src/markers/type-generators.ts
|
|
158
|
+
/**
 * Builds the regular expression for a 'pattern' marker.
 *
 * Two inputs are supported, checked in this order:
 * 1. `template`: expanded through expandTemplate, using the built-in TOKENS
 *    extended with `config.tokens` when those are given
 * 2. `pattern`: raw regex source, compiled as-is
 *
 * Both variants are compiled with the `u` flag.
 *
 * @param config - Marker configuration with either `template` or `pattern` field
 * @returns A compiled RegExp object for matching the pattern
 * @throws {Error} When neither `template` nor `pattern` is provided
 *
 * @example
 * // Using template
 * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
 *
 * @example
 * // Using raw pattern
 * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
 *
 * @example
 * // Using custom tokens
 * const regex = generatePatternRegex({
 *   type: 'pattern',
 *   template: '{verse}',
 *   tokens: { verse: '\\[[0-9]+\\]' }
 * });
 */
function generatePatternRegex(config) {
    if (!config.template) {
        // Raw-pattern mode: the caller supplies the complete regex source.
        if (!config.pattern) throw new Error("pattern marker must provide either a template or pattern");
        return new RegExp(config.pattern, "u");
    }
    const availableTokens = config.tokens ? createTokenMap(config.tokens) : TOKENS;
    const source = expandTemplate(config.template, { tokens: availableTokens });
    return new RegExp(source, "u");
}
|
|
194
|
+
/**
 * Builds the regular expression for 'bab' (chapter) markers.
 *
 * The marker is the word باب made diacritic-insensitive through bitaboom's
 * makeDiacriticInsensitive, optionally followed by one trailing vowel or
 * tanween mark (as in بَابُ or بَابٌ). Everything after it is the content.
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBabRegex();
 * const match = regex.exec('باب الصلاة');
 * // match.groups.marker -> 'باب'
 * // match.groups.content -> ' الصلاة'
 */
function generateBabRegex() {
    const markerSource = `${makeDiacriticInsensitive("باب")}[ًٌٍَُ]?`;
    return new RegExp(String.raw`^(?<full>(?<marker>${markerSource})(?<content>[\s\S]*))`, "u");
}
|
|
213
|
+
/**
 * Generates a regular expression for hadith chain (isnad) markers.
 *
 * Matches text that starts with one of the narrator phrases, e.g. حَدَّثَنَا or
 * أَخْبَرَنَا. Every phrase is passed through makeDiacriticInsensitive and the
 * results are joined into a single alternation.
 *
 * DEFAULT_HADITH_PHRASES is used when `config` is omitted or when
 * `config.phrases` is missing or empty. An empty list would otherwise produce
 * an empty marker that matches every input.
 *
 * @param config - Marker configuration with optional `phrases` array
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * // Using default phrases
 * const regex = generateHadithChainRegex({ type: 'hadith-chain' });
 * const match = regex.exec('حَدَّثَنَا أبو بكر');
 *
 * @example
 * // Using custom phrases
 * const regex = generateHadithChainRegex({
 *   type: 'hadith-chain',
 *   phrases: ['قَالَ', 'رَوَى']
 * });
 */
function generateHadithChainRegex(config) {
    const customPhrases = config?.phrases;
    // Fall back to the defaults for a missing or empty list.
    const phrases = customPhrases && customPhrases.length > 0 ? customPhrases : DEFAULT_HADITH_PHRASES;
    const phrasesPattern = phrases.map((p) => makeDiacriticInsensitive(p)).join("|");
    const pattern = String.raw`^(?<full>(?<marker>${phrasesPattern})(?<content>[\s\S]*))`;
    return new RegExp(pattern, "u");
}
|
|
240
|
+
/**
 * Builds the regular expression for basmala markers.
 *
 * The marker is an alternation of every entry in DEFAULT_BASMALA_PATTERNS,
 * each passed through makeDiacriticInsensitive first. That list covers:
 * - بسم الله (matched with or without diacritics, e.g. بِسْمِ اللَّهِ)
 * - Special patterns like [بسم, [تم
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBasmalaRegex();
 * const match = regex.exec('بسم الله الرحمن الرحيم');
 * // match.groups.marker -> 'بسم الله'
 */
function generateBasmalaRegex() {
    const alternatives = [];
    for (const basmala of DEFAULT_BASMALA_PATTERNS) {
        alternatives.push(makeDiacriticInsensitive(basmala));
    }
    const markerSource = alternatives.join("|");
    return new RegExp(String.raw`^(?<full>(?<marker>${markerSource})(?<content>[\s\S]*))`, "u");
}
|
|
260
|
+
/**
 * Builds the regular expression for custom phrase markers.
 *
 * Works like the hadith-chain generator, except that there is no default list:
 * the caller must supply the phrases. Each phrase is passed through
 * makeDiacriticInsensitive and the results are joined into one alternation.
 *
 * @param config - Marker configuration with required `phrases` array
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `phrases` is undefined or empty
 *
 * @example
 * const regex = generatePhraseRegex({
 *   type: 'phrase',
 *   phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
 * });
 */
function generatePhraseRegex(config) {
    const hasPhrases = Boolean(config.phrases) && config.phrases.length !== 0;
    if (!hasPhrases) {
        throw new Error("phrase marker requires phrases array");
    }
    const alternatives = [];
    for (const phrase of config.phrases) {
        alternatives.push(makeDiacriticInsensitive(phrase));
    }
    const markerSource = alternatives.join("|");
    return new RegExp(String.raw`^(?<full>(?<marker>${markerSource})(?<content>[\s\S]*))`, "u");
}
|
|
282
|
+
/**
 * Builds the regular expression for square-bracket reference markers.
 *
 * The marker is an Arabic-Indic number in square brackets, optionally preceded
 * by a bullet or degree sign and optionally followed by one whitespace
 * character, which is consumed as part of the marker:
 * - [٦٥] - Simple bracket
 * - • [٦٥] - With bullet prefix
 * - ° [٦٥] - With degree prefix
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateSquareBracketRegex();
 * const match = regex.exec('[٦٥] نص الحديث');
 * // match.groups.marker -> '[٦٥] '
 * // match.groups.content -> 'نص الحديث'
 */
function generateSquareBracketRegex() {
    const source = [
        String.raw`^(?<full>`,
        String.raw`(?<marker>[•°]?\s?\[[\u0660-\u0669]+\]\s?)`,
        String.raw`(?<content>[\s\S]*)`,
        ")"
    ].join("");
    return new RegExp(source, "u");
}
|
|
302
|
+
/**
 * Generates a regular expression for number-letter-separator markers.
 *
 * The marker is a number, exactly one space, one Arabic letter, an optional
 * whitespace character and the separator:
 * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
 * - 5 ب. (Latin number, Arabic letter, dot)
 *
 * A missing `numbering` or `separator` falls back to DEFAULT_NUMBERING or
 * DEFAULT_SEPARATOR, so a direct call never interpolates "undefined" into the
 * regex. A separator that is not a SEPARATOR_PATTERNS key is used as raw regex
 * source.
 *
 * @param config - Configuration with `numbering` and `separator` fields
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `numbering` is not a known numbering style
 *
 * @example
 * const regex = generateNumLetterRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ أ - نص');
 */
function generateNumLetterRegex(config) {
    const numbering = config?.numbering ?? DEFAULT_NUMBERING;
    const separator = config?.separator ?? DEFAULT_SEPARATOR;
    const numPattern = NUMBERING_PATTERNS[numbering];
    // Fail loudly rather than compile a regex around the text "undefined".
    if (typeof numPattern !== "string") throw new Error(`Unknown numbering style: ${numbering}`);
    const sepPattern = SEPARATOR_PATTERNS[separator] ?? separator;
    const markerPattern = String.raw`${numPattern} [أ-ي]\s?${sepPattern}`;
    const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
    return new RegExp(pattern, "u");
}
|
|
326
|
+
/**
 * Generates a regular expression for number-parenthetical-separator markers.
 *
 * The marker is a number, optional whitespace, a parenthesised run of
 * characters from the Arabic block (U+0600 to U+06FF) or whitespace, an
 * optional whitespace character and the separator:
 * - ٥ (أ) - (number, parenthetical content, separator)
 * - 5 (٦) - (number with parenthetical number)
 *
 * A missing `numbering` or `separator` falls back to DEFAULT_NUMBERING or
 * DEFAULT_SEPARATOR, so a direct call never interpolates "undefined" into the
 * regex. A separator that is not a SEPARATOR_PATTERNS key is used as raw regex
 * source.
 *
 * @param config - Configuration with `numbering` and `separator` fields
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `numbering` is not a known numbering style
 *
 * @example
 * const regex = generateNumParenRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ (أ) - نص');
 */
function generateNumParenRegex(config) {
    const numbering = config?.numbering ?? DEFAULT_NUMBERING;
    const separator = config?.separator ?? DEFAULT_SEPARATOR;
    const numPattern = NUMBERING_PATTERNS[numbering];
    // Fail loudly rather than compile a regex around the text "undefined".
    if (typeof numPattern !== "string") throw new Error(`Unknown numbering style: ${numbering}`);
    const sepPattern = SEPARATOR_PATTERNS[separator] ?? separator;
    const markerPattern = String.raw`${numPattern}\s*\([\u0600-\u06FF\u0660-\u0669\s]+\)\s?${sepPattern}`;
    const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
    return new RegExp(pattern, "u");
}
|
|
350
|
+
/**
 * Generates a regular expression for number-slash-number markers.
 *
 * The marker is a number, an optional "/ number" part, an optional whitespace
 * character and the separator:
 * - ٥/٦ - (number slash number, separator)
 * - ٥ - (single number, separator)
 *
 * A missing `numbering` or `separator` falls back to DEFAULT_NUMBERING or
 * DEFAULT_SEPARATOR, so a direct call never interpolates "undefined" into the
 * regex. A separator that is not a SEPARATOR_PATTERNS key is used as raw regex
 * source.
 *
 * @param config - Configuration with `numbering` and `separator` fields
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `numbering` is not a known numbering style
 *
 * @example
 * const regex = generateNumSlashRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match1 = regex.exec('٥/٦ - نص');
 * const match2 = regex.exec('٥ - نص'); // Also matches
 */
function generateNumSlashRegex(config) {
    const numbering = config?.numbering ?? DEFAULT_NUMBERING;
    const separator = config?.separator ?? DEFAULT_SEPARATOR;
    const numPattern = NUMBERING_PATTERNS[numbering];
    // Fail loudly rather than compile a regex around the text "undefined".
    if (typeof numPattern !== "string") throw new Error(`Unknown numbering style: ${numbering}`);
    const sepPattern = SEPARATOR_PATTERNS[separator] ?? separator;
    const markerPattern = String.raw`${numPattern}(?:\s?/\s?${numPattern})?\s?${sepPattern}`;
    const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
    return new RegExp(pattern, "u");
}
|
|
377
|
+
/**
 * Generates a regular expression for numbered markers with optional format template.
 *
 * Supports two modes:
 * 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}').
 *    `numbering` and `separator` are ignored in this mode.
 * 2. Default pattern: Uses `numbering` and `separator` to build "number,
 *    optional whitespace character, separator".
 *
 * When using default pattern:
 * - Separator 'none' (or an empty separator) generates a pattern without separator
 * - A separator that is not a SEPARATOR_PATTERNS key is used as raw regex source
 * - A missing `numbering` or `separator` falls back to DEFAULT_NUMBERING or
 *   DEFAULT_SEPARATOR, so a direct call never interpolates "undefined" into
 *   the regex
 *
 * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When the default pattern is used and `numbering` is not a known numbering style
 *
 * @example
 * // Using format template
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash',
 *   format: '{bullet}+ {num} {dash}'
 * });
 *
 * @example
 * // Using default pattern
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ - نص');
 *
 * @example
 * // With 'none' separator
 * const regex = generateNumberedRegex({
 *   numbering: 'latin',
 *   separator: 'none'
 * });
 * const match = regex.exec('5 text');
 */
function generateNumberedRegex(config) {
    if (config?.format) {
        const tokenMap = config.tokens ? createTokenMap(config.tokens) : TOKENS;
        const expandedPattern = expandTemplate(config.format, { tokens: tokenMap });
        return new RegExp(expandedPattern, "u");
    }
    const numbering = config?.numbering ?? DEFAULT_NUMBERING;
    const separator = config?.separator ?? DEFAULT_SEPARATOR;
    const numPattern = NUMBERING_PATTERNS[numbering];
    // Fail loudly rather than compile a regex around the text "undefined".
    if (typeof numPattern !== "string") throw new Error(`Unknown numbering style: ${numbering}`);
    const sepPattern = separator !== "none" ? SEPARATOR_PATTERNS[separator] ?? separator : "";
    const markerPattern = sepPattern ? String.raw`${numPattern}\s?${sepPattern}` : numPattern;
    const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
    return new RegExp(pattern, "u");
}
|
|
428
|
+
/**
 * Builds the regular expression for bullet-point markers.
 *
 * The marker is one bullet character plus an optional whitespace character,
 * which is consumed as part of the marker. Accepted bullet characters:
 * - • (bullet)
 * - * (asterisk)
 * - ° (degree)
 * - - (dash)
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBulletRegex();
 * const match = regex.exec('• نقطة');
 * // match.groups.content -> 'نقطة'
 */
function generateBulletRegex() {
    const markerSource = String.raw`[•*°\-]\s?`;
    const source = String.raw`^(?<full>(?<marker>${markerSource})(?<content>[\s\S]*))`;
    return new RegExp(source, "u");
}
|
|
447
|
+
/**
 * Builds the regular expression for Markdown-style heading markers.
 *
 * The marker is one or more hash symbols plus an optional whitespace
 * character, which is consumed as part of the marker:
 * - # Heading 1
 * - ## Heading 2
 * - ### Heading 3
 * - etc.
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateHeadingRegex();
 * const match = regex.exec('## عنوان فرعي');
 * // match.groups.marker -> '## '
 * // match.groups.content -> 'عنوان فرعي'
 */
function generateHeadingRegex() {
    const markerSource = String.raw`#+\s?`;
    const source = String.raw`^(?<full>(?<marker>${markerSource})(?<content>[\s\S]*))`;
    return new RegExp(source, "u");
}
|
|
467
|
+
|
|
468
|
+
//#endregion
|
|
469
|
+
//#region src/markers/generator.ts
|
|
470
|
+
/**
 * Generates a regex pattern from a marker configuration.
 * Except for raw `pattern` markers, whose regex is compiled as supplied, the
 * result has three named capture groups:
 * - full: Complete match including marker
 * - marker: Just the marker part (for metadata/indexing)
 * - content: Everything after the marker
 *
 * The default numbering and separator are applied before delegating to the
 * type-specific generator. The defaults are written after the spread of
 * `config`, so an explicit `undefined` or `null` in the config cannot
 * overwrite them.
 *
 * @param config - Marker configuration
 * @returns Regular expression with named groups
 * @throws {Error} When `config.type` is not a known marker type
 *
 * @example
 * const regex = generateRegexFromMarker({ type: 'numbered' });
 * const match = regex.exec('٥ - نص');
 * match.groups.full // "٥ - نص"
 * match.groups.marker // "٥ -"
 * match.groups.content // " نص"
 */
function generateRegexFromMarker(config) {
    const normalized = {
        ...config,
        numbering: config.numbering ?? DEFAULT_NUMBERING,
        separator: config.separator ?? DEFAULT_SEPARATOR
    };
    switch (normalized.type) {
        case "pattern": return generatePatternRegex(normalized);
        case "bab": return generateBabRegex();
        case "hadith-chain": return generateHadithChainRegex(normalized);
        case "basmala": return generateBasmalaRegex();
        case "phrase": return generatePhraseRegex(normalized);
        case "square-bracket": return generateSquareBracketRegex();
        case "num-letter": return generateNumLetterRegex(normalized);
        case "num-paren": return generateNumParenRegex(normalized);
        case "num-slash": return generateNumSlashRegex(normalized);
        case "numbered": return generateNumberedRegex(normalized);
        case "bullet": return generateBulletRegex();
        case "heading": return generateHeadingRegex();
        default: throw new Error(`Unknown marker type: ${normalized.type}`);
    }
}
|
|
514
|
+
|
|
515
|
+
//#endregion
|
|
516
|
+
export { DEFAULT_BASMALA_PATTERNS, DEFAULT_HADITH_PHRASES, DEFAULT_NUMBERING, DEFAULT_SEPARATOR, DEFAULT_SEPARATOR_PATTERN, NUMBERING_PATTERNS, SEPARATOR_PATTERNS, TOKENS, createTokenMap, expandTemplate, generateBabRegex, generateBasmalaRegex, generateBulletRegex, generateHadithChainRegex, generateHeadingRegex, generateNumLetterRegex, generateNumParenRegex, generateNumSlashRegex, generateNumberedRegex, generatePatternRegex, generatePhraseRegex, generateRegexFromMarker, generateSquareBracketRegex, validateTemplate };
|
|
517
|
+
//# sourceMappingURL=index.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.mjs","names":["DEFAULT_NUMBERING: NumberingStyle","DEFAULT_SEPARATOR: SeparatorStyle","NUMBERING_PATTERNS: Record<NumberingStyle, string>","SEPARATOR_PATTERNS: Record<SeparatorStyle, string>","normalized: NormalizedMarkerConfig","_exhaustive: never"],"sources":["../src/markers/defaults.ts","../src/markers/presets.ts","../src/markers/tokens.ts","../src/markers/template-parser.ts","../src/markers/type-generators.ts","../src/markers/generator.ts"],"sourcesContent":["/**\n * Default configuration values for marker patterns\n * All defaults are centralized here as a single source of truth\n */\n\nimport type { NumberingStyle, SeparatorStyle } from '../types.js';\n\n/**\n * Default numbering style for markers\n */\nexport const DEFAULT_NUMBERING: NumberingStyle = 'arabic-indic';\n\n/**\n * Default separator style for markers\n */\nexport const DEFAULT_SEPARATOR: SeparatorStyle = 'dash';\n\n/**\n * Default separator pattern (used when separator is a custom string)\n */\nexport const DEFAULT_SEPARATOR_PATTERN = '[-–—ـ]';\n\n/**\n * Numbering patterns mapped by style\n */\nexport const NUMBERING_PATTERNS: Record<NumberingStyle, string> = {\n 'arabic-indic': '[\\\\u0660-\\\\u0669]+',\n 'latin': '\\\\d+',\n};\n\n/**\n * Separator patterns mapped by style\n */\nexport const SEPARATOR_PATTERNS: Record<SeparatorStyle, string> = {\n 'colon': ':',\n 'dash': '[-–—ـ]',\n 'dot': '\\\\.',\n 'none': '',\n 'paren': '\\\\)',\n};\n","/**\n * Default phrase lists for preset marker types.\n * Export these so users can extend them.\n */\n\n/**\n * Common hadith narrator phrases (diacritic-insensitive)\n * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']\n */\nexport const DEFAULT_HADITH_PHRASES = [\n 'حَدَّثَنَا',\n 'حدثنا',\n 'أَخْبَرَنَا',\n 'حدثني',\n 'حدَّثني',\n 'وحدثنا',\n 'حُدِّثت عن',\n 'وحَدَّثَنَا',\n] as const;\n\n/**\n * Common basmala patterns\n * Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']\n */\nexport const 
DEFAULT_BASMALA_PATTERNS = ['بسم الله', '\\\\[بسم', '\\\\[تم'] as const;\n","/**\n * Token definitions for pattern templates.\n * Tokens provide a readable alternative to raw regex patterns.\n */\n\n/**\n * Standard tokens for building marker patterns.\n * Use these in templates like: '{num} {dash}' instead of '[\\\\u0660-\\\\u0669]+ [-–—ـ]'\n */\nexport const TOKENS = {\n // Special characters\n bullet: '[•*°]', // Bullet point variants\n colon: ':', // Colon\n comma: '،', // Arabic comma\n content: '(.*)', // Capture rest of line\n\n // Separators\n dash: '[-–—ـ]', // Various dash types\n dot: '\\\\.', // Period\n latin: '\\\\d+', // Latin numerals\n letter: '[أ-ي]', // Arabic letters\n // Numbers\n num: '[\\\\u0660-\\\\u0669]+', // Arabic-Indic numerals\n paren: '\\\\)', // Closing parenthesis\n s: '\\\\s?', // Optional whitespace\n slash: '/', // Forward slash\n\n // Structural\n space: '\\\\s+', // One or more whitespace\n} as const;\n\nexport type TokenName = keyof typeof TOKENS;\nexport type TokenMap = Record<string, string>;\n","import type { TokenMap } from './tokens.js';\nimport { TOKENS } from './tokens.js';\n\n/**\n * Result of template validation\n */\nexport interface ValidationResult {\n valid: boolean;\n errors?: string[];\n}\n\n/**\n * Options for template expansion\n */\nexport interface ExpandOptions {\n /** Custom token map to use instead of default TOKENS */\n tokens?: TokenMap;\n}\n\n/**\n * Expands a template string into a regex pattern using named capture groups.\n * Always creates three groups: full (entire match), marker (just the marker), content (clean text).\n * \n * The content group uses [\\s\\S]*? 
(non-greedy) to match across newlines but stop at next marker.\n * \n * @param template - Template string with {token} placeholders\n * @param options - Optional configuration\n * @returns Regex pattern string with named groups\n * \n * @example\n * expandTemplate('{num} {dash}')\n * // Returns: ^(?<full>(?<marker>[\\\\u0660-\\\\u0669]+\\\\s?[-–—ـ])(?<content>[\\\\s\\\\S]*?))\n */\nexport function expandTemplate(template: string, options?: ExpandOptions): string {\n const tokenMap = options?.tokens || TOKENS;\n\n // Replace {token} placeholders with actual patterns\n let expandedMarker = template;\n for (const [token, pattern] of Object.entries(tokenMap)) {\n const placeholder = `{${token}}`;\n expandedMarker = expandedMarker.replaceAll(placeholder, pattern);\n }\n\n // Always create three named groups:\n // - full: complete match (for segmentation)\n // - marker: just the marker part (for metadata/indexing)\n // - content: clean content (for LLM processing) - uses [\\s\\S]* to match newlines\n // Note: greedy * is correct here - ilmtest-cli must split content by marker positions\n return String.raw`^(?<full>(?<marker>${expandedMarker})(?<content>[\\s\\S]*))`;\n}\n\n/**\n * Create a custom token map by extending the base tokens.\n * \n * @param customTokens - Custom token definitions\n * @returns Combined token map\n * \n * @example\n * const myTokens = createTokenMap({\n * verse: '\\\\[[\\\\u0660-\\\\u0669]+\\\\]',\n * tafsir: 'تفسير'\n * });\n */\nexport function createTokenMap(customTokens: Record<string, string>): TokenMap {\n return { ...TOKENS, ...customTokens };\n}\n\n/**\n * Validates a template string.\n * \n * @param template - Template to validate\n * @param tokens - Token map to validate against\n * @returns Validation result with errors if invalid\n * \n * @example\n * validateTemplate('{num} {dash}')\n * // Returns: { valid: true }\n * \n * validateTemplate('{invalid}')\n * // Returns: { valid: false, errors: ['Unknown token: {invalid}'] }\n 
*/\nexport function validateTemplate(template: string, tokens: TokenMap = TOKENS): ValidationResult {\n const tokenMatches = template.match(/\\{(\\w+)\\}/g) || [];\n const tokenNames = tokenMatches.map(t => t.slice(1, -1));\n const unknownTokens = tokenNames.filter(name => !tokens[name]);\n\n if (unknownTokens.length > 0) {\n return {\n valid: false,\n errors: [\n `Unknown tokens: ${unknownTokens.map(t => `{${t}}`).join(', ')}`,\n `Available tokens: ${Object.keys(tokens).map(t => `{${t}}`).join(', ')}`\n ]\n };\n }\n\n return { valid: true };\n}\n","import { makeDiacriticInsensitive } from 'bitaboom';\nimport type { MarkerConfig } from '@/types.js';\nimport { NUMBERING_PATTERNS, SEPARATOR_PATTERNS } from './defaults.js';\nimport { DEFAULT_BASMALA_PATTERNS, DEFAULT_HADITH_PHRASES } from './presets.js';\nimport { createTokenMap, expandTemplate } from './template-parser.js';\nimport { TOKENS } from './tokens.js';\n\n/**\n * Generates a regular expression for pattern-type markers.\n *\n * Supports two modes:\n * 1. Template-based: Uses the `template` field with token expansion\n * 2. Pattern-based: Uses the raw `pattern` field as-is\n *\n * @param config - Marker configuration with either `template` or `pattern` field\n * @returns A compiled RegExp object for matching the pattern\n * @throws {Error} When neither `template` nor `pattern` is provided\n *\n * @example\n * // Using template\n * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });\n *\n * @example\n * // Using raw pattern\n * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\\\d+' });\n *\n * @example\n * // Using custom tokens\n * const regex = generatePatternRegex({\n * type: 'pattern',\n * template: '{verse}',\n * tokens: { verse: '\\\\[[0-9]+\\\\]' }\n * });\n */\nexport function generatePatternRegex(config: MarkerConfig): RegExp {\n if (config.template) {\n const tokenMap = config.tokens ? 
createTokenMap(config.tokens) : TOKENS;\n const pattern = expandTemplate(config.template, {\n tokens: tokenMap,\n });\n return new RegExp(pattern, 'u');\n }\n\n if (!config.pattern) {\n throw new Error('pattern marker must provide either a template or pattern');\n }\n return new RegExp(config.pattern, 'u');\n}\n\n/**\n * Generates a regular expression for 'bab' (chapter) markers.\n *\n * Matches Arabic chapter markers like باب, بَابُ, بَابٌ with optional diacritics.\n * The pattern is diacritic-insensitive using bitaboom's makeDiacriticInsensitive.\n *\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateBabRegex();\n * const match = regex.exec('باب الصلاة');\n * // match.groups.marker -> 'باب'\n * // match.groups.content -> ' الصلاة'\n */\nexport function generateBabRegex(): RegExp {\n const babPattern = makeDiacriticInsensitive('باب');\n const pattern = String.raw`^(?<full>(?<marker>${babPattern}[ًٌٍَُ]?)(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for hadith chain (isnad) markers.\n *\n * Matches common hadith narrator phrases like حَدَّثَنَا, أَخْبَرَنَا, etc.\n * Uses default phrases from presets or custom phrases from config.\n * All phrases are made diacritic-insensitive.\n *\n * @param config - Marker configuration with optional `phrases` array\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * // Using default phrases\n * const regex = generateHadithChainRegex({ type: 'hadith-chain' });\n * const match = regex.exec('حَدَّثَنَا أبو بكر');\n *\n * @example\n * // Using custom phrases\n * const regex = generateHadithChainRegex({\n * type: 'hadith-chain',\n * phrases: ['قَالَ', 'رَوَى']\n * });\n */\nexport function generateHadithChainRegex(config: MarkerConfig): RegExp {\n const phrases = config.phrases || DEFAULT_HADITH_PHRASES;\n const phrasesPattern = phrases.map((p) => 
makeDiacriticInsensitive(p)).join('|');\n const pattern = String.raw`^(?<full>(?<marker>${phrasesPattern})(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for basmala markers.\n *\n * Matches various forms of بِسْمِ اللَّهِ (In the name of Allah):\n * - بسم الله (without diacritics)\n * - بِسْمِ اللَّهِ (with diacritics)\n * - Special patterns like [بسم, [تم\n *\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateBasmalaRegex();\n * const match = regex.exec('بسم الله الرحمن الرحيم');\n * // match.groups.marker -> 'بسم الله'\n */\nexport function generateBasmalaRegex(): RegExp {\n const patterns = DEFAULT_BASMALA_PATTERNS.map((p) => makeDiacriticInsensitive(p));\n const combinedPattern = patterns.join('|');\n const pattern = String.raw`^(?<full>(?<marker>${combinedPattern})(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for custom phrase markers.\n *\n * Similar to hadith-chain markers but requires explicit phrase list.\n * All phrases are made diacritic-insensitive.\n *\n * @param config - Marker configuration with required `phrases` array\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n * @throws {Error} When `phrases` is undefined or empty\n *\n * @example\n * const regex = generatePhraseRegex({\n * type: 'phrase',\n * phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']\n * });\n */\nexport function generatePhraseRegex(config: MarkerConfig): RegExp {\n if (!config.phrases || config.phrases.length === 0) {\n throw new Error('phrase marker requires phrases array');\n }\n const phrasesPattern = config.phrases.map((p) => makeDiacriticInsensitive(p)).join('|');\n const pattern = String.raw`^(?<full>(?<marker>${phrasesPattern})(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for square bracket markers.\n *\n * Matches 
verse or hadith reference numbers in square brackets:\n * - [٦٥] - Simple bracket\n * - • [٦٥] - With bullet prefix\n * - ° [٦٥] - With degree prefix\n *\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateSquareBracketRegex();\n * const match = regex.exec('[٦٥] نص الحديث');\n * // match.groups.content -> ' نص الحديث'\n */\nexport function generateSquareBracketRegex(): RegExp {\n const markerPattern = String.raw`[•°]?\\s?\\[[\\u0660-\\u0669]+\\]\\s?`;\n const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for number-letter-separator markers.\n *\n * Matches patterns like:\n * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)\n * - 5 ب. (Latin number, Arabic letter, dot)\n *\n * @param config - Configuration with required `numbering` and `separator` fields\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateNumLetterRegex({\n * numbering: 'arabic-indic',\n * separator: 'dash'\n * });\n * const match = regex.exec('٥ أ - نص');\n */\nexport function generateNumLetterRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp {\n const numPattern = NUMBERING_PATTERNS[config.numbering as keyof typeof NUMBERING_PATTERNS];\n const sepPattern = SEPARATOR_PATTERNS[config.separator as keyof typeof SEPARATOR_PATTERNS] ?? 
config.separator;\n const markerPattern = String.raw`${numPattern} [أ-ي]\\s?${sepPattern}`;\n const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for number-parenthetical-separator markers.\n *\n * Matches patterns like:\n * - ٥ (أ) - (number, parenthetical content, separator)\n * - 5 (٦) - (number with parenthetical number)\n *\n * @param config - Configuration with required `numbering` and `separator` fields\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateNumParenRegex({\n * numbering: 'arabic-indic',\n * separator: 'dash'\n * });\n * const match = regex.exec('٥ (أ) - نص');\n */\nexport function generateNumParenRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp {\n const numPattern = NUMBERING_PATTERNS[config.numbering as keyof typeof NUMBERING_PATTERNS];\n const sepPattern = SEPARATOR_PATTERNS[config.separator as keyof typeof SEPARATOR_PATTERNS] ?? 
config.separator;\n const markerPattern = String.raw`${numPattern}\\s*\\([\\u0600-\\u06FF\\u0660-\\u0669\\s]+\\)\\s?${sepPattern}`;\n const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for number-slash-number markers.\n *\n * Matches patterns like:\n * - ٥/٦ - (number slash number, separator)\n * - ٥ - (single number, separator)\n *\n * The second number after the slash is optional.\n *\n * @param config - Configuration with required `numbering` and `separator` fields\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateNumSlashRegex({\n * numbering: 'arabic-indic',\n * separator: 'dash'\n * });\n * const match1 = regex.exec('٥/٦ - نص');\n * const match2 = regex.exec('٥ - نص'); // Also matches\n */\nexport function generateNumSlashRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp {\n const numPattern = NUMBERING_PATTERNS[config.numbering as keyof typeof NUMBERING_PATTERNS];\n const sepPattern = SEPARATOR_PATTERNS[config.separator as keyof typeof SEPARATOR_PATTERNS] ?? config.separator;\n const markerPattern = String.raw`${numPattern}(?:\\s?/\\s?${numPattern})?\\s?${sepPattern}`;\n const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\\s\\S]*))`;\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for numbered markers with optional format template.\n *\n * Supports two modes:\n * 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}')\n * 2. 
Default pattern: Uses `numbering` and `separator` to build standard numbered markers\n *\n * When using default pattern:\n * - Separator 'none' generates pattern without separator\n * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS\n *\n * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * // Using format template\n * const regex = generateNumberedRegex({\n * numbering: 'arabic-indic',\n * separator: 'dash',\n * format: '{bullet}+ {num} {dash}'\n * });\n *\n * @example\n * // Using default pattern\n * const regex = generateNumberedRegex({\n * numbering: 'arabic-indic',\n * separator: 'dash'\n * });\n * const match = regex.exec('٥ - نص');\n *\n * @example\n * // With 'none' separator\n * const regex = generateNumberedRegex({\n * numbering: 'latin',\n * separator: 'none'\n * });\n * const match = regex.exec('5 text');\n */\nexport function generateNumberedRegex(\n config: Pick<MarkerConfig, 'numbering' | 'separator' | 'format' | 'tokens'>,\n): RegExp {\n if (config.format) {\n const tokenMap = config.tokens ? createTokenMap(config.tokens) : TOKENS;\n const expandedPattern = expandTemplate(config.format, {\n tokens: tokenMap,\n });\n return new RegExp(expandedPattern, 'u');\n }\n\n const numPattern = NUMBERING_PATTERNS[config.numbering as keyof typeof NUMBERING_PATTERNS];\n const separator = config.separator;\n const sepPattern =\n separator !== 'none' ? (SEPARATOR_PATTERNS[separator as keyof typeof SEPARATOR_PATTERNS] ?? separator) : '';\n\n const markerPattern = sepPattern ? 
String.raw`${numPattern}\\s?${sepPattern}` : numPattern;\n const pattern = String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\\s\\S]*))`;\n\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for bullet-point markers.\n *\n * Matches common bullet characters:\n * - • (bullet)\n * - * (asterisk)\n * - ° (degree)\n * - - (dash)\n *\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateBulletRegex();\n * const match = regex.exec('• نقطة');\n * // match.groups.content -> 'نقطة'\n */\nexport function generateBulletRegex(): RegExp {\n const pattern = '^(?<full>(?<marker>[•*°\\\\-]\\\\s?)(?<content>[\\\\s\\\\S]*))';\n return new RegExp(pattern, 'u');\n}\n\n/**\n * Generates a regular expression for Markdown-style heading markers.\n *\n * Matches heading levels using hash symbols:\n * - # Heading 1\n * - ## Heading 2\n * - ### Heading 3\n * - etc.\n *\n * @returns A compiled RegExp with named groups: `full`, `marker`, `content`\n *\n * @example\n * const regex = generateHeadingRegex();\n * const match = regex.exec('## عنوان فرعي');\n * // match.groups.marker -> '## '\n * // match.groups.content -> 'عنوان فرعي'\n */\nexport function generateHeadingRegex(): RegExp {\n const pattern = '^(?<full>(?<marker>#+\\\\s?)(?<content>[\\\\s\\\\S]*))';\n return new RegExp(pattern, 'u');\n}\n","/**\n * Main entry point for marker regex generation\n * Delegates to type-specific generators\n */\n\nimport type { MarkerConfig } from '../types.js';\nimport { DEFAULT_NUMBERING, DEFAULT_SEPARATOR } from './defaults.js';\nimport {\n generateBabRegex,\n generateBasmalaRegex,\n generateBulletRegex,\n generateHadithChainRegex,\n generateHeadingRegex,\n generateNumberedRegex,\n generateNumLetterRegex,\n generateNumParenRegex,\n generateNumSlashRegex,\n generatePatternRegex,\n generatePhraseRegex,\n generateSquareBracketRegex,\n} from './type-generators.js';\n\n/**\n * Normalized marker config with 
all defaults applied\n * This ensures generators always receive complete configurations\n */\ntype NormalizedMarkerConfig = Required<Pick<MarkerConfig, 'numbering' | 'separator'>> & MarkerConfig;\n\n/**\n * Generates a regex pattern from a marker configuration.\n * Always returns a regex with three named capture groups:\n * - full: Complete match including marker\n * - marker: Just the marker part (for metadata/indexing)\n * - content: Clean content without marker (for LLM processing)\n * \n * This function applies all default values before delegating to type-specific generators.\n * \n * @param config - Marker configuration\n * @returns Regular expression with named groups\n * \n * @example\n * const regex = generateRegexFromMarker({ type: 'numbered' });\n * const match = regex.exec('٥ - نص');\n * match.groups.full // \"٥ - نص\"\n * match.groups.marker // \"٥ -\"\n * match.groups.content // \"نص\"\n */\nexport function generateRegexFromMarker(config: MarkerConfig): RegExp {\n // Apply all defaults in one place - single source of truth\n const normalized: NormalizedMarkerConfig = {\n numbering: config.numbering ?? DEFAULT_NUMBERING,\n separator: config.separator ?? 
DEFAULT_SEPARATOR,\n ...config,\n };\n\n // Delegate to type-specific generators\n // Generators now receive normalized config with all defaults applied\n switch (normalized.type) {\n case 'pattern':\n return generatePatternRegex(normalized);\n case 'bab':\n return generateBabRegex();\n case 'hadith-chain':\n return generateHadithChainRegex(normalized);\n case 'basmala':\n return generateBasmalaRegex();\n case 'phrase':\n return generatePhraseRegex(normalized);\n case 'square-bracket':\n return generateSquareBracketRegex();\n case 'num-letter':\n return generateNumLetterRegex(normalized);\n case 'num-paren':\n return generateNumParenRegex(normalized);\n case 'num-slash':\n return generateNumSlashRegex(normalized);\n case 'numbered':\n return generateNumberedRegex(normalized);\n case 'bullet':\n return generateBulletRegex();\n case 'heading':\n return generateHeadingRegex();\n default: {\n // TypeScript exhaustiveness check\n const _exhaustive: never = normalized.type;\n throw new Error(`Unknown marker type: ${_exhaustive}`);\n }\n 
}\n}\n"],"mappings":";;;;;;AAUA,MAAaA,oBAAoC;;;;AAKjD,MAAaC,oBAAoC;;;;AAKjD,MAAa,4BAA4B;;;;AAKzC,MAAaC,qBAAqD;CAC9D,gBAAgB;CAChB,SAAS;CACZ;;;;AAKD,MAAaC,qBAAqD;CAC9D,SAAS;CACT,QAAQ;CACR,OAAO;CACP,QAAQ;CACR,SAAS;CACZ;;;;;;;;;;;;AC9BD,MAAa,yBAAyB;CAClC;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACH;;;;;AAMD,MAAa,2BAA2B;CAAC;CAAY;CAAU;CAAQ;;;;;;;;;;;;ACfvE,MAAa,SAAS;CAElB,QAAQ;CACR,OAAO;CACP,OAAO;CACP,SAAS;CAGT,MAAM;CACN,KAAK;CACL,OAAO;CACP,QAAQ;CAER,KAAK;CACL,OAAO;CACP,GAAG;CACH,OAAO;CAGP,OAAO;CACV;;;;;;;;;;;;;;;;;;ACID,SAAgB,eAAe,UAAkB,SAAiC;CAC9E,MAAM,WAAW,SAAS,UAAU;CAGpC,IAAI,iBAAiB;AACrB,MAAK,MAAM,CAAC,OAAO,YAAY,OAAO,QAAQ,SAAS,EAAE;EACrD,MAAM,cAAc,IAAI,MAAM;AAC9B,mBAAiB,eAAe,WAAW,aAAa,QAAQ;;AAQpE,QAAO,OAAO,GAAG,sBAAsB,eAAe;;;;;;;;;;;;;;AAe1D,SAAgB,eAAe,cAAgD;AAC3E,QAAO;EAAE,GAAG;EAAQ,GAAG;EAAc;;;;;;;;;;;;;;;;AAiBzC,SAAgB,iBAAiB,UAAkB,SAAmB,QAA0B;CAG5F,MAAM,iBAFe,SAAS,MAAM,aAAa,IAAI,EAAE,EACvB,KAAI,MAAK,EAAE,MAAM,GAAG,GAAG,CAAC,CACvB,QAAO,SAAQ,CAAC,OAAO,MAAM;AAE9D,KAAI,cAAc,SAAS,EACvB,QAAO;EACH,OAAO;EACP,QAAQ,CACJ,mBAAmB,cAAc,KAAI,MAAK,IAAI,EAAE,GAAG,CAAC,KAAK,KAAK,IAC9D,qBAAqB,OAAO,KAAK,OAAO,CAAC,KAAI,MAAK,IAAI,EAAE,GAAG,CAAC,KAAK,KAAK,GACzE;EACJ;AAGL,QAAO,EAAE,OAAO,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AC9D1B,SAAgB,qBAAqB,QAA8B;AAC/D,KAAI,OAAO,UAAU;EACjB,MAAM,WAAW,OAAO,SAAS,eAAe,OAAO,OAAO,GAAG;EACjE,MAAM,UAAU,eAAe,OAAO,UAAU,EAC5C,QAAQ,UACX,CAAC;AACF,SAAO,IAAI,OAAO,SAAS,IAAI;;AAGnC,KAAI,CAAC,OAAO,QACR,OAAM,IAAI,MAAM,2DAA2D;AAE/E,QAAO,IAAI,OAAO,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;AAiB1C,SAAgB,mBAA2B;CACvC,MAAM,aAAa,yBAAyB,MAAM;CAClD,MAAM,UAAU,OAAO,GAAG,sBAAsB,WAAW;AAC3D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;;;;;;;;AAyBnC,SAAgB,yBAAyB,QAA8B;CAEnE,MAAM,kBADU,OAAO,WAAW,wBACH,KAAK,MAAM,yBAAyB,EAAE,CAAC,CAAC,KAAK,IAAI;CAChF,MAAM,UAAU,OAAO,GAAG,sBAAsB,eAAe;AAC/D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;AAkBnC,SAAgB,uBAA+B;CAE3C,MAAM,kBADW,yBAAyB,KAAK,MAAM,yBAAyB,EAAE,CAAC,CAChD,KAAK,IAAI;CAC1C,MAAM,UAAU,OAAO,GAAG,sBAAsB,gBAAgB;AAChE,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;;AAmBnC,S
AAgB,oBAAoB,QAA8B;AAC9D,KAAI,CAAC,OAAO,WAAW,OAAO,QAAQ,WAAW,EAC7C,OAAM,IAAI,MAAM,uCAAuC;CAE3D,MAAM,iBAAiB,OAAO,QAAQ,KAAK,MAAM,yBAAyB,EAAE,CAAC,CAAC,KAAK,IAAI;CACvF,MAAM,UAAU,OAAO,GAAG,sBAAsB,eAAe;AAC/D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;AAkBnC,SAAgB,6BAAqC;CACjD,MAAM,gBAAgB,OAAO,GAAG;CAChC,MAAM,UAAU,OAAO,GAAG,sBAAsB,cAAc;AAC9D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;;;AAoBnC,SAAgB,uBAAuB,QAA+D;CAClG,MAAM,aAAa,mBAAmB,OAAO;CAC7C,MAAM,aAAa,mBAAmB,OAAO,cAAiD,OAAO;CACrG,MAAM,gBAAgB,OAAO,GAAG,GAAG,WAAW,WAAW;CACzD,MAAM,UAAU,OAAO,GAAG,sBAAsB,cAAc;AAC9D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;;;AAoBnC,SAAgB,sBAAsB,QAA+D;CACjG,MAAM,aAAa,mBAAmB,OAAO;CAC7C,MAAM,aAAa,mBAAmB,OAAO,cAAiD,OAAO;CACrG,MAAM,gBAAgB,OAAO,GAAG,GAAG,WAAW,2CAA2C;CACzF,MAAM,UAAU,OAAO,GAAG,sBAAsB,cAAc;AAC9D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;;;;;;AAuBnC,SAAgB,sBAAsB,QAA+D;CACjG,MAAM,aAAa,mBAAmB,OAAO;CAC7C,MAAM,aAAa,mBAAmB,OAAO,cAAiD,OAAO;CACrG,MAAM,gBAAgB,OAAO,GAAG,GAAG,WAAW,YAAY,WAAW,OAAO;CAC5E,MAAM,UAAU,OAAO,GAAG,sBAAsB,cAAc;AAC9D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAyCnC,SAAgB,sBACZ,QACM;AACN,KAAI,OAAO,QAAQ;EACf,MAAM,WAAW,OAAO,SAAS,eAAe,OAAO,OAAO,GAAG;EACjE,MAAM,kBAAkB,eAAe,OAAO,QAAQ,EAClD,QAAQ,UACX,CAAC;AACF,SAAO,IAAI,OAAO,iBAAiB,IAAI;;CAG3C,MAAM,aAAa,mBAAmB,OAAO;CAC7C,MAAM,YAAY,OAAO;CACzB,MAAM,aACF,cAAc,SAAU,mBAAmB,cAAiD,YAAa;CAE7G,MAAM,gBAAgB,aAAa,OAAO,GAAG,GAAG,WAAW,KAAK,eAAe;CAC/E,MAAM,UAAU,OAAO,GAAG,sBAAsB,cAAc;AAE9D,QAAO,IAAI,OAAO,SAAS,IAAI;;;;;;;;;;;;;;;;;;AAmBnC,SAAgB,sBAA8B;AAE1C,QAAO,IAAI,OADK,0DACW,IAAI;;;;;;;;;;;;;;;;;;;AAoBnC,SAAgB,uBAA+B;AAE3C,QAAO,IAAI,OADK,oDACW,IAAI;;;;;;;;;;;;;;;;;;;;;;;;ACzSnC,SAAgB,wBAAwB,QAA8B;CAElE,MAAMC,aAAqC;EACvC,WAAW,OAAO,aAAa;EAC/B,WAAW,OAAO,aAAa;EAC/B,GAAG;EACN;AAID,SAAQ,WAAW,MAAnB;EACI,KAAK,UACD,QAAO,qBAAqB,WAAW;EAC3C,KAAK,MACD,QAAO,kBAAkB;EAC7B,KAAK,eACD,QAAO,yBAAyB,WAAW;EAC/C,KAAK,UACD,QAAO,sBAAsB;EACjC,KAAK,SACD,QAAO,oBAAoB,WAAW;EAC1C,KAAK,iBACD,QAAO,4BAA4B;EACvC,KAAK,aACD,QAAO,uBAAuB,WAAW;EAC7C,KAAK,YACD,QAAO,sBAAs
B,WAAW;EAC5C,KAAK,YACD,QAAO,sBAAsB,WAAW;EAC5C,KAAK,WACD,QAAO,sBAAsB,WAAW;EAC5C,KAAK,SACD,QAAO,qBAAqB;EAChC,KAAK,UACD,QAAO,sBAAsB;EACjC,SAAS;GAEL,MAAMC,cAAqB,WAAW;AACtC,SAAM,IAAI,MAAM,wBAAwB,cAAc"}
|
package/package.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{
|
|
2
|
+
"author": "Ragaeeb Haq",
|
|
3
|
+
"bugs": {
|
|
4
|
+
"url": "https://github.com/ragaeeb/flappa-doormal/issues"
|
|
5
|
+
},
|
|
6
|
+
"dependencies": {
|
|
7
|
+
"bitaboom": "^2.1.0"
|
|
8
|
+
},
|
|
9
|
+
"description": "Arabic text marker pattern library for generating regex from declarative configurations",
|
|
10
|
+
"devDependencies": {
|
|
11
|
+
"@biomejs/biome": "2.3.7",
|
|
12
|
+
"@types/bun": "latest",
|
|
13
|
+
"tsdown": "^0.16.7",
|
|
14
|
+
"typescript": "^5.9.3"
|
|
15
|
+
},
|
|
16
|
+
"exports": {
|
|
17
|
+
".": {
|
|
18
|
+
"import": "./dist/index.mjs",
|
|
19
|
+
"types": "./dist/index.d.mts"
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
"files": [
|
|
23
|
+
"dist",
|
|
24
|
+
"README.md"
|
|
25
|
+
],
|
|
26
|
+
"homepage": "https://github.com/ragaeeb/flappa-doormal#readme",
|
|
27
|
+
"keywords": [
|
|
28
|
+
"arabic",
|
|
29
|
+
"text-processing",
|
|
30
|
+
"hadith",
|
|
31
|
+
"islamic-texts",
|
|
32
|
+
"nlp",
|
|
33
|
+
"regex",
|
|
34
|
+
"pattern-matching",
|
|
35
|
+
"marker-detection"
|
|
36
|
+
],
|
|
37
|
+
"license": "MIT",
|
|
38
|
+
"module": "./dist/index.mjs",
|
|
39
|
+
"name": "flappa-doormal",
|
|
40
|
+
"repository": {
|
|
41
|
+
"type": "git",
|
|
42
|
+
"url": "https://github.com/ragaeeb/flappa-doormal.git"
|
|
43
|
+
},
|
|
44
|
+
"scripts": {
|
|
45
|
+
"build": "tsdown"
|
|
46
|
+
},
|
|
47
|
+
"type": "module",
|
|
48
|
+
"types": "./dist/index.d.mts",
|
|
49
|
+
"version": "1.0.0"
|
|
50
|
+
}
|