bitaboom 1.5.0 → 2.1.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +149 -818
- package/dist/index.d.ts +68 -112
- package/dist/index.js +7 -7
- package/dist/index.js.map +1 -1
- package/package.json +10 -11
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
//#region src/arabic.d.ts
|
|
1
2
|
/**
|
|
2
3
|
* Converts Arabic-Indic numerals (٠-٩) to a JavaScript number.
|
|
3
4
|
*
|
|
@@ -44,6 +45,24 @@ declare const convertUrduSymbolsToArabic: (text: string) => string;
|
|
|
44
45
|
* @returns A decimal between 0-1 representing the Arabic character ratio (0 = no Arabic, 1 = all Arabic)
|
|
45
46
|
*/
|
|
46
47
|
declare const getArabicScore: (text: string) => number;
|
|
48
|
+
/**
|
|
49
|
+
* Finds the position of the last punctuation character in a string
|
|
50
|
+
*
|
|
51
|
+
* @param text - The text to search through
|
|
52
|
+
* @returns The index of the last punctuation character, or -1 if none found
|
|
53
|
+
*
|
|
54
|
+
* @example
|
|
55
|
+
* ```typescript
|
|
56
|
+
* const text = "Hello world! How are you?";
|
|
57
|
+
* const lastPuncIndex = findLastPunctuation(text);
|
|
58
|
+
* // Result: 24 (position of the last '?')
|
|
59
|
+
*
|
|
60
|
+
* const noPuncText = "Hello world";
|
|
61
|
+
* const notFound = findLastPunctuation(noPuncText);
|
|
62
|
+
* // Result: -1 (no punctuation found)
|
|
63
|
+
* ```
|
|
64
|
+
*/
|
|
65
|
+
declare const findLastPunctuation: (text: string) => number;
|
|
47
66
|
/**
|
|
48
67
|
* Fixes the trailing "و" (waw) in phrases such as "عليكم و رحمة" to "عليكم ورحمة".
|
|
49
68
|
* This function attempts to correct phrases where "و" appears unnecessarily, particularly in greetings.
|
|
@@ -87,81 +106,8 @@ declare const removeSolitaryArabicLetters: (text: string) => string;
|
|
|
87
106
|
* @returns {string} - The modified text with English punctuation replaced by Arabic punctuation.
|
|
88
107
|
*/
|
|
89
108
|
declare const replaceEnglishPunctuationWithArabic: (text: string) => string;
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
* Ultra-fast Arabic text sanitizer for search/indexing/display.
|
|
93
|
-
* Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
|
|
94
|
-
* Options can merge over a base preset or `'none'` to apply exactly the rules you request.
|
|
95
|
-
*/
|
|
96
|
-
type SanitizePreset = 'light' | 'search' | 'aggressive';
|
|
97
|
-
type SanitizeBase = 'none' | SanitizePreset;
|
|
98
|
-
/**
|
|
99
|
-
* Public options for {@link sanitizeArabic}. When you pass an options object, it overlays the chosen
|
|
100
|
-
* `base` (default `'light'`) without allocating merged objects on the hot path; flags are resolved
|
|
101
|
-
* directly into local booleans for speed.
|
|
102
|
-
*/
|
|
103
|
-
type SanitizeOptions = {
|
|
104
|
-
/** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
|
|
105
|
-
base?: SanitizeBase;
|
|
106
|
-
/** Unicode NFC normalization. Default: `true` in all presets. */
|
|
107
|
-
nfc?: boolean;
|
|
108
|
-
/** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
|
|
109
|
-
stripZeroWidth?: boolean;
|
|
110
|
-
/** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
|
|
111
|
-
zeroWidthToSpace?: boolean;
|
|
112
|
-
/** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
|
|
113
|
-
stripDiacritics?: boolean;
|
|
114
|
-
/**
|
|
115
|
-
* Remove tatweel (ـ).
|
|
116
|
-
* - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
|
|
117
|
-
* - `'safe'` or `'all'` explicitly
|
|
118
|
-
* - `false` to keep tatweel
|
|
119
|
-
* Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
|
|
120
|
-
*/
|
|
121
|
-
stripTatweel?: boolean | 'safe' | 'all';
|
|
122
|
-
/** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
|
|
123
|
-
normalizeAlif?: boolean;
|
|
124
|
-
/** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
|
|
125
|
-
replaceAlifMaqsurah?: boolean;
|
|
126
|
-
/** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
|
|
127
|
-
replaceTaMarbutahWithHa?: boolean;
|
|
128
|
-
/** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
|
|
129
|
-
stripLatinAndSymbols?: boolean;
|
|
130
|
-
/** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
|
|
131
|
-
keepOnlyArabicLetters?: boolean;
|
|
132
|
-
/** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
|
|
133
|
-
lettersAndSpacesOnly?: boolean;
|
|
134
|
-
/** Collapse runs of whitespace to a single space. Default: `true`. */
|
|
135
|
-
collapseWhitespace?: boolean;
|
|
136
|
-
/** Trim leading/trailing whitespace. Default: `true`. */
|
|
137
|
-
trim?: boolean;
|
|
138
|
-
/**
|
|
139
|
-
* Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
|
|
140
|
-
* (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
|
|
141
|
-
* Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
|
|
142
|
-
*/
|
|
143
|
-
removeHijriMarker?: boolean;
|
|
144
|
-
};
|
|
145
|
-
/**
|
|
146
|
-
* Sanitizes Arabic text according to a preset or custom options.
|
|
147
|
-
*
|
|
148
|
-
* Presets:
|
|
149
|
-
* - `'light'`: NFC, zero-width removal, collapse/trim spaces.
|
|
150
|
-
* - `'search'`: removes diacritics and tatweel, normalizes Alif and ى→ي, removes Hijri marker.
|
|
151
|
-
* - `'aggressive'`: ideal for FTS; keeps letters+spaces only and strips common noise.
|
|
152
|
-
*
|
|
153
|
-
* Custom options:
|
|
154
|
-
* - Passing an options object overlays the selected `base` preset (default `'light'`).
|
|
155
|
-
* - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
|
|
156
|
-
*
|
|
157
|
-
* Examples:
|
|
158
|
-
* ```ts
|
|
159
|
-
* sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
|
|
160
|
-
* sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '1435 3 29'
|
|
161
|
-
* sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
|
|
162
|
-
* ```
|
|
163
|
-
*/
|
|
164
|
-
declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions) => string;
|
|
109
|
+
//#endregion
|
|
110
|
+
//#region src/cleaning.d.ts
|
|
165
111
|
/**
|
|
166
112
|
* Escape a string so it can be safely embedded into a RegExp source.
|
|
167
113
|
*
|
|
@@ -171,40 +117,40 @@ declare const sanitizeArabic: (input: string, optionsOrPreset?: SanitizePreset |
|
|
|
171
117
|
declare const escapeRegex: (s: string) => string;
|
|
172
118
|
/** Optional equivalence toggles for {@link makeDiacriticInsensitiveRegex}. */
|
|
173
119
|
type EquivOptions = {
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
120
|
+
/** Treat ا/أ/إ/آ as equivalent. @default true */
|
|
121
|
+
alif?: boolean;
|
|
122
|
+
/** Treat ة/ه as equivalent. @default true */
|
|
123
|
+
taMarbutahHa?: boolean;
|
|
124
|
+
/** Treat ى/ي as equivalent. @default true */
|
|
125
|
+
alifMaqsurahYa?: boolean;
|
|
180
126
|
};
|
|
181
127
|
/** Options for {@link makeDiacriticInsensitiveRegex}. */
|
|
182
128
|
type MakeRegexOptions = {
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
129
|
+
/**
|
|
130
|
+
* Character equivalences to allow.
|
|
131
|
+
* @default { alif: true, taMarbutahHa: true, alifMaqsurahYa: true }
|
|
132
|
+
*/
|
|
133
|
+
equivalences?: EquivOptions;
|
|
134
|
+
/**
|
|
135
|
+
* Allow tatweel between letters (tolerate decorative elongation).
|
|
136
|
+
* @default true
|
|
137
|
+
*/
|
|
138
|
+
allowTatweel?: boolean;
|
|
139
|
+
/**
|
|
140
|
+
* Ignore diacritics by inserting a `DIACRITICS_CLASS*` after each letter.
|
|
141
|
+
* @default true
|
|
142
|
+
*/
|
|
143
|
+
ignoreDiacritics?: boolean;
|
|
144
|
+
/**
|
|
145
|
+
* Treat any whitespace in the needle as `\s+` for flexible matching.
|
|
146
|
+
* @default true
|
|
147
|
+
*/
|
|
148
|
+
flexWhitespace?: boolean;
|
|
149
|
+
/**
|
|
150
|
+
* RegExp flags to use.
|
|
151
|
+
* @default 'u'
|
|
152
|
+
*/
|
|
153
|
+
flags?: string;
|
|
208
154
|
};
|
|
209
155
|
/**
|
|
210
156
|
* Build a **diacritic-insensitive**, **tatweel-tolerant** RegExp for Arabic text matching.
|
|
@@ -225,7 +171,13 @@ type MakeRegexOptions = {
|
|
|
225
171
|
* rx.test('اَنا إلى الآفاق'); // true
|
|
226
172
|
*/
|
|
227
173
|
declare const makeDiacriticInsensitiveRegex: (needle: string, opts?: MakeRegexOptions) => RegExp;
|
|
228
|
-
|
|
174
|
+
declare const removeAllTags: (content: string) => string;
|
|
175
|
+
//#endregion
|
|
176
|
+
//#region src/constants.d.ts
|
|
177
|
+
/** Matches text ending with common punctuation marks */
|
|
178
|
+
declare const PATTERN_ENDS_WITH_PUNCTUATION: RegExp;
|
|
179
|
+
//#endregion
|
|
180
|
+
//#region src/formatting.d.ts
|
|
229
181
|
/**
|
|
230
182
|
* Adds line breaks after punctuation marks such as periods, exclamation points, and question marks.
|
|
231
183
|
* Example: 'First. Second.' becomes 'First.\nSecond.' (the result is trimmed, so no trailing line break is left after the final mark).
|
|
@@ -470,7 +422,8 @@ declare const toTitleCase: (str: string) => string;
|
|
|
470
422
|
* @returns {string} - The modified text with spaces removed inside quotes.
|
|
471
423
|
*/
|
|
472
424
|
declare const trimSpaceInsideQuotes: (text: string) => string;
|
|
473
|
-
|
|
425
|
+
//#endregion
|
|
426
|
+
//#region src/parsing.d.ts
|
|
474
427
|
/**
|
|
475
428
|
* Converts a string that resembles JSON but with numeric keys and single-quoted values
|
|
476
429
|
* into valid JSON format. This function replaces numeric keys with quoted numeric keys
|
|
@@ -540,7 +493,8 @@ declare const isBalanced: (str: string) => boolean;
|
|
|
540
493
|
* @throws Error when start page exceeds end page in range
|
|
541
494
|
*/
|
|
542
495
|
declare const parsePageRanges: (pageInput: string) => number[];
|
|
543
|
-
|
|
496
|
+
//#endregion
|
|
497
|
+
//#region src/sanitization.d.ts
|
|
544
498
|
/**
|
|
545
499
|
* Removes various symbols, part references, and numerical markers from the text.
|
|
546
500
|
* Example: '(1) (2/3)' becomes ''.
|
|
@@ -669,7 +623,8 @@ declare const unescapeSpaces: (input: string) => string;
|
|
|
669
623
|
* @returns Regex pattern string that matches the text with or without diacritics and character variants
|
|
670
624
|
*/
|
|
671
625
|
declare const makeDiacriticInsensitive: (text: string) => string;
|
|
672
|
-
|
|
626
|
+
//#endregion
|
|
627
|
+
//#region src/transliteration.d.ts
|
|
673
628
|
/**
|
|
674
629
|
* Replaces common Arabic prefixes (like 'Al-', 'Ar-', 'Ash-', etc.) with 'al-' in the text.
|
|
675
630
|
* Handles different variations of prefixes such as Ash- and Al- but not when the second word
|
|
@@ -729,5 +684,6 @@ declare const normalizeTransliteratedEnglish: (text: string) => string;
|
|
|
729
684
|
* @returns {string} - The extracted initials.
|
|
730
685
|
*/
|
|
731
686
|
declare const extractInitials: (fullName: string) => string;
|
|
732
|
-
|
|
733
|
-
export {
|
|
687
|
+
//#endregion
|
|
688
|
+
export { MakeRegexOptions, PATTERN_ENDS_WITH_PUNCTUATION, addSpaceBeforeAndAfterPunctuation, addSpaceBetweenArabicTextAndNumbers, applySmartQuotes, arabicNumeralToNumber, cleanExtremeArabicUnderscores, cleanLiteralNewLines, cleanMultilines, cleanSpacesBeforePeriod, cleanSymbolsAndPartReferences, cleanTrailingPageNumbers, condenseAsterisks, condenseColons, condenseDashes, condenseEllipsis, condensePeriods, condenseUnderscores, convertUrduSymbolsToArabic, doubleToSingleBrackets, ensureSpaceBeforeBrackets, ensureSpaceBeforeQuotes, escapeRegex, extractInitials, findLastPunctuation, fixBracketTypos, fixCurlyBraces, fixMismatchedQuotationMarks, fixTrailingWow, formatStringBySentence, getArabicScore, hasWordInSingleLine, insertLineBreaksAfterPunctuation, isAllUppercase, isBalanced, isJsonStructureValid, isOnlyPunctuation, makeDiacriticInsensitive, makeDiacriticInsensitiveRegex, normalize, normalizeArabicPrefixesToAl, normalizeDoubleApostrophes, normalizeJsonSyntax, normalizeSlashInReferences, normalizeSpaces, normalizeTransliteratedEnglish, parsePageRanges, reduceMultilineBreaksToDouble, reduceMultilineBreaksToSingle, removeAllTags, removeArabicPrefixes, removeDeathYear, removeMarkdownFormatting, removeNonIndexSignatures, removeNumbersAndDashes, removeRedundantPunctuation, removeSingleDigitReferences, removeSingularCodes, removeSolitaryArabicLetters, removeSpaceInsideBrackets, removeUrls, replaceDoubleBracketsWithArrows, replaceEnglishPunctuationWithArabic, replaceLineBreaksWithSpaces, replaceSalutationsWithSymbol, splitByQuotes, stripAllDigits, stripBoldStyling, stripItalicsStyling, stripStyling, toTitleCase, trimSpaceInsideQuotes, truncate, truncateMiddle, unescapeSpaces };
|
|
689
|
+
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
|
|
1
|
+
const e=/[.!?؟؛]$/,t=e=>parseInt(e.replace(/[\u0660-\u0669]/g,e=>(e.charCodeAt(0)-1632).toString()),10),n=e=>e.replace(/(?<!\d ?ه|اه)ـ(?=\r?$)|^ـ(?!اهـ)/gm,``),r=e=>e.replace(/ھ/g,`ه`).replace(/ی/g,`ي`),i=e=>{if(!e)return 0;let t=/[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/g,n=/[0-9\u0660-\u0669\u06F0-\u06F9]/g,r=/[^\s0-9\u0660-\u0669\u06F0-\u06F9]/g,i=e.replace(n,``),a=i.match(t)||[],o=i.match(r)||[];return o.length===0?0:a.length/o.length},a=t=>{for(let n=t.length-1;n>=0;n--)if(e.test(t[n]))return n;return-1},o=e=>e.replace(/ و /g,` و`),s=e=>e.replace(/([\u0600-\u06FF]+)(\d+)/g,`$1 $2`),c=e=>e.replace(/(?<![0-9] ?)-|(?<=[\u0600-\u06FF])\s?\d\s?(?=[\u0600-\u06FF])/g,` `).replace(/(?<=[\u0600-\u06FF]\s)(\d+\s)+\d+(?=(\s[\u0600-\u06FF]|$))/g,` `),l=e=>e.replace(/[[({][\u0621-\u064A\u0660-\u0669][\])}]/g,``),u=e=>e.replace(/(^| )[\u0621-\u064A]( |$)/g,` `),d=e=>e.replace(/\?|؟\./g,`؟`).replace(/(;|؛)\s*(\1\s*)*/g,`؛`).replace(/,|-،/g,`،`),f=e=>e.replace(/[.*+?^${}()|[\]\\]/g,`\\$&`),p=(e,t={})=>{let{equivalences:n={alif:!0,taMarbutahHa:!0,alifMaqsurahYa:!0},allowTatweel:r=!0,ignoreDiacritics:i=!0,flexWhitespace:a=!0,flags:o=`u`}=t;if(e.length>5e3)throw Error(`makeDiacriticInsensitiveRegex: needle too long`);let s=e=>{switch(e){case`ا`:case`أ`:case`إ`:case`آ`:return n.alif?`[اأإآ]`:`ا`;case`ة`:case`ه`:return n.taMarbutahHa?`[هة]`:f(e);case`ى`:case`ي`:return n.alifMaqsurahYa?`[ىي]`:f(e);default:return f(e)}},c=`${i?`[\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06ED]*`:``}${r?`\\u0640*`:``}`,l=``;for(let t of Array.from(e))/\s/.test(t)?l+=a?`\\s+`:`\\s*`:l+=`${s(t)}${c}`;return new RegExp(l,o)},m=e=>e.replace(/<[^>]*>/g,``),h=e=>e.replace(/([.?!؟])/g,`$1
|
|
2
2
|
`).replace(/\n\s+/g,`
|
|
3
|
-
`).trim()
|
|
4
|
-
`),
|
|
3
|
+
`).trim(),ee=e=>e.replace(/( ?)([.!?,،؟;؛])((?![ '”“)"\]\n])|(?=\s{2,}))/g,`$1$2 `).replace(/\s([.!?,،؟;؛])\s*([ '”“)"\]\n])/g,`$1$2`).replace(/([^\s\w\d'”“)"\]]+)\s+([.!?,،؟;؛])|([.!?,،؟;؛])\s+$/g,`$1$2$3`).replace(/(?<=\D)( ?: ?)(?!(\d+:)|(:\d+))|(?<=\d) ?: ?(?=\D)|(?<=\D) ?: ?(?=\d)/g,`: `),te=e=>e.replace(/[“”]/g,`"`).replace(/"([^"]*)"/g,`“$1”`).replace(/^”/g,`“`),ne=e=>e.replace(/\\n|\r/g,`
|
|
4
|
+
`),g=e=>e.replace(/^ +| +$/gm,``),_=e=>/^\s*\S+\s*$/gm.test(e),v=e=>/^[\u0020-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e0-9٠-٩]+$/.test(e),y=e=>e.replace(/\s+([.؟!,،؛:?])/g,`$1`),b=e=>e.replace(/(\*\s*)+/g,`*`),x=e=>e.replace(/[.-]?:[.-]?/g,`:`),S=e=>e.replace(/-{2,}/g,`-`),C=e=>e.replace(/\.{2,}/g,`…`),w=e=>e.replace(/(\n\s*){3,}/g,`
|
|
5
5
|
|
|
6
|
-
`),
|
|
7
|
-
`),
|
|
8
|
-
`),
|
|
9
|
-
`)},
|
|
6
|
+
`),T=e=>e.replace(/(\n\s*){2,}/g,`
|
|
7
|
+
`),E=e=>e.replace(/\. +\./g,`.`),D=e=>e.replace(/ـ{2,}/g,`ـ`).replace(/_+/g,`_`),O=e=>e.replace(/(\(|\)){2,}|(\[|\]){2,}/g,`$1$2`),k=e=>e.replace(/(\S) *(\([^)]*\))/g,`$1 $2`),A=e=>e.replace(/(\S) *(«[^»]*»)/g,`$1 $2`),j=e=>e.replace(/\(«|\( \(/g,`«`).replace(/»\)|\) \)/g,`»`).replace(/\)([0-9\u0660-\u0669]+)\)/g,`($1)`).replace(/\)([0-9\u0660-\u0669]+)\(/g,`($1)`),M=e=>{let t=e;return t=t.replace(/\(([^(){}]+)\}/g,`{$1}`),t.replace(/\{([^(){}]+)\)/g,`{$1}`)},N=e=>e.replace(/«([^»)]+)\)/g,`«$1»`).replace(/\(([^()]+)»/g,`«$1»`).replace(/«([^»]+)(?=\s*$|$)/g,`«$1»`),P=e=>{let t=/^\((?:\d+|۱|۲|۳|۴|۵|۶|۷|۸|۹)\)\s/,n=[],r=e.split(`
|
|
8
|
+
`),i=``;return r.forEach(e=>{let r=e.trim(),a=t.test(r),o=/^\(\d+\/\d+\)/.test(r);if(a&&!o)i&&=(n.push(i.trim()),``),n.push(r);else{i+=`${r} `;let e=i.trim().slice(-1);/[.!؟]/.test(e)&&(n.push(i.trim()),i=``)}}),i&&n.push(i.trim()),n.join(`
|
|
9
|
+
`)},F=e=>{let t=e.replace(/[^\p{L}]/gu,``);return t.length===0?!1:t===t.toUpperCase()},I=e=>e.replace(/(\d+)\s?\/\s?(\d+)/g,`$1/$2`),L=e=>e.replace(/[ \t]+/g,` `),R=e=>e.replace(/([؟!])[.،]/g,`$1`),z=e=>e.replace(/([[(])\s*(.*?)\s*([\])])/g,`$1$2$3`),B=e=>e.replace(/\(\(\s?/g,`«`).replace(/\s?\)\)/g,`»`),V=e=>e.normalize(`NFKD`).replace(/[\u0300-\u036f]/g,``).trim(),H=e=>{let t={𝑎:`I`,𝑨:`g`,𝘼:`!`,𝑏:`J`,𝑩:`h`,𝘽:`?`,𝑐:`K`,𝑪:`i`,𝑑:`L`,𝑫:`j`,𝘿:`,`,𝑒:`M`,𝑬:`k`,𝙀:`.`,𝑓:`N`,𝑭:`l`,𝑔:`O`,𝑮:`m`,𝑯:`n`,𝑖:`Q`,𝑰:`o`,𝑗:`R`,𝑱:`p`,𝑘:`S`,𝑲:`q`,𝑙:`T`,𝑳:`r`,𝙇:`-`,𝑚:`U`,𝑴:`s`,𝑛:`V`,𝑵:`t`,𝑜:`W`,𝑶:`u`,𝑝:`X`,𝑷:`v`,𝑞:`Y`,𝑸:`w`,𝑟:`Z`,𝑹:`x`,𝑆:`A`,𝑺:`y`,𝑇:`B`,𝑻:`z`,𝑢:`a`,𝑈:`C`,𝑣:`b`,𝑉:`D`,𝑤:`c`,𝑊:`E`,𝑥:`d`,𝑋:`F`,𝑦:`e`,𝑌:`G`,𝑧:`f`,𝑍:`H`,"":`P`};return e.replace(/[\uD835\uDC62-\uD835\uDC7B\uD835\uDC46-\uD835\uDC5F\u{1D63C}-\u{1D647}]/gu,e=>t[e]||e)},U=e=>H(V(e)),W=e=>e.toLowerCase().split(` `).map(e=>{if(e.length===0)return e;let t=e.match(/\p{L}/u);if(!t||t.index===void 0)return e;let n=t.index;return e.slice(0,n)+e.charAt(n).toUpperCase()+e.slice(n+1)}).join(` `),G=e=>e.replace(/([“”"]|«) *(.*?) 
*([“”"]|»)/g,`$1$2$3`),K=e=>{let t=e.replace(/(\b\d+\b)(?=:)/g,`"$1"`);return t=t.replace(/:\s*'([^']+)'/g,`: "$1"`),t=t.replace(/:\s*"([^"]+)"/g,`: "$1"`),JSON.stringify(JSON.parse(t))},q=e=>/^{(\s*(\d+|'[^']*'|"[^"]*")\s*:\s*('|")[^'"]*\3\s*,)*(?:\s*(\d+|'[^']*'|"[^"]*")\s*:\s*('|")[^'"]*\5\s*)}$/.test(e.trim()),J=e=>(e.match(/(?:[^\s"]+|"(.*?)")+/g)||[]).map(e=>e.startsWith(`"`)?e.slice(1,-1):e),Y=e=>{let t=0;for(let n of e)n===`"`&&t++;return t%2==0},X={"(":`)`,"[":`]`,"{":`}`},re=new Set([`(`,`[`,`{`]),ie=new Set([`)`,`]`,`}`]),ae=e=>{let t=[];for(let n of e)if(re.has(n))t.push(n);else if(ie.has(n)){let e=t.pop();if(!e||X[e]!==n)return!1}return t.length===0},oe=e=>Y(e)&&ae(e),se=e=>{if(e.includes(`-`)){let[t,n]=e.split(`-`).map(Number);if(t>n)throw Error(`Start page cannot be greater than end page`);return Array.from({length:n-t+1},(e,n)=>t+n)}else return e.split(`,`).map(Number)},ce=e=>e.replace(/ *\(?:\d+(?:\/\d+){0,2}\)? *| *\[\d+(?:\/\d+)?\] *| *«\d+» *|\d+\/\d+(?:\/\d+)?|[،§{}؍﴿﴾<>;_؟»«:!،؛[\]…ـ¬.\\/*()"]/g,` `),le=e=>e.replace(/-\[\d+\]-/g,``),ue=e=>e.replace(/\s+/g,` `),de=e=>e.replace(/[0-9]/g,``),fe=e=>e.replace(/\[(d)\.\s*\d{1,4}[hH]\]\s*|\((d)\.\s*\d{1,4}[hH]\)\s*/g,``),pe=e=>e.replace(/[\d-]/g,``),me=e=>e.replace(/\(\d{1}\)|\[\d{1}\]|«\d»/g,``),he=e=>e.replace(/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/g,``),ge=e=>e.replace(/\*\*([^*]+)\*\*/g,`$1`).replace(/__([^_]+)__/g,`$1`).replace(/\*([^*]+)\*/g,`$1`).replace(/_([^_]+)_/g,`$1`).replace(/~~([^~]+)~~/g,`$1`).replace(/^\s*>\s?/gm,``).replace(/!\[[^\]]*]\([^)]*\)/g,``).replace(/\[([^\]]+)]\([^)]*\)/g,`$1`).replace(/^#+\s*/gm,``).replace(/^\s*[-*+]\s+/gm,``).replace(/^\s*\d+\.\s+/gm,``).replace(/`/gm,``),_e=(e,t=150)=>e.length>t?`${e.substring(0,t-1)}…`:e,ve=(e,t=50,n)=>{if(e.length<=t)return e;let r=Math.max(3,Math.floor(t/3)),i=n??r,a=t-1-i;return 
a<1?`${e.substring(0,t-1)}…`:`${e.substring(0,a)}…${e.substring(e.length-i)}`},ye=e=>e.replace(/\\ /g,` `).trim(),be=[[`ا`,`آ`,`أ`,`إ`],[`ة`,`ه`],[`ى`,`ي`]],xe=e=>{for(let t of be)if(t.includes(e))return`[${t.map(e=>f(e)).join(``)}]`;return f(e)},Se=e=>e.normalize(`NFC`).replace(/[\u200C\u200D]/g,``).replace(/\s+/g,` `).trim(),Ce=e=>{let t=Se(e);return Array.from(t).map(e=>xe(e)+`[ًٌٍَُِّْ]*`).join(``)},we=e=>e.replace(/(\b|\W)(Al |Al-|Ar-|As-|Adh-|Ad-|Ats-|Ath |Ath-|Az |Az-|az-|adh-|as-|ar-)/g,`$1al-`).replace(/(\b|\W)(Ash-S|ash-S)/g,`$1al-S`).replace(/al- (.+?)\b/g,`al-$1`),Te=e=>e.replace(/ʿʿ/g,`ʿ`).replace(/ʾʾ/g,`ʾ`),Ee=e=>e.replace(/\(peace be upon him\)|(Messenger of (Allah|Allāh)|Messenger|Prophet|Mu[hḥ]ammad) *\((s[^)]*m|peace[^)]*him|May[^)]*him|may[^)]*him)\)*/gi,`$1 ﷺ`).replace(/,\s*ﷺ\s*,/g,` ﷺ`),Z=e=>e.normalize(`NFKD`).replace(/[\u0300-\u036f]/g,``).replace(/`|ʾ|ʿ|-/g,``),Q=e=>L(e.replace(/(\bal-|\bli-|\bbi-|\bfī|\bwa[-\s]+|\bl-|\bliʿl|\Bʿalá|\Bʿan|\bb\.)/gi,``)),$=e=>Z(Q(e)),De=e=>$(e).trim().split(/[ -]/).slice(0,2).map(e=>e.charAt(0).toUpperCase()).join(``);export{e as PATTERN_ENDS_WITH_PUNCTUATION,ee as addSpaceBeforeAndAfterPunctuation,s as addSpaceBetweenArabicTextAndNumbers,te as applySmartQuotes,t as arabicNumeralToNumber,n as cleanExtremeArabicUnderscores,ne as cleanLiteralNewLines,g as cleanMultilines,y as cleanSpacesBeforePeriod,ce as cleanSymbolsAndPartReferences,le as cleanTrailingPageNumbers,b as condenseAsterisks,x as condenseColons,S as condenseDashes,C as condenseEllipsis,E as condensePeriods,D as condenseUnderscores,r as convertUrduSymbolsToArabic,O as doubleToSingleBrackets,k as ensureSpaceBeforeBrackets,A as ensureSpaceBeforeQuotes,f as escapeRegex,De as extractInitials,a as findLastPunctuation,j as fixBracketTypos,M as fixCurlyBraces,N as fixMismatchedQuotationMarks,o as fixTrailingWow,P as formatStringBySentence,i as getArabicScore,_ as hasWordInSingleLine,h as insertLineBreaksAfterPunctuation,F as isAllUppercase,oe as 
isBalanced,q as isJsonStructureValid,v as isOnlyPunctuation,Ce as makeDiacriticInsensitive,p as makeDiacriticInsensitiveRegex,Z as normalize,we as normalizeArabicPrefixesToAl,Te as normalizeDoubleApostrophes,K as normalizeJsonSyntax,I as normalizeSlashInReferences,L as normalizeSpaces,$ as normalizeTransliteratedEnglish,se as parsePageRanges,w as reduceMultilineBreaksToDouble,T as reduceMultilineBreaksToSingle,m as removeAllTags,Q as removeArabicPrefixes,fe as removeDeathYear,ge as removeMarkdownFormatting,c as removeNonIndexSignatures,pe as removeNumbersAndDashes,R as removeRedundantPunctuation,me as removeSingleDigitReferences,l as removeSingularCodes,u as removeSolitaryArabicLetters,z as removeSpaceInsideBrackets,he as removeUrls,B as replaceDoubleBracketsWithArrows,d as replaceEnglishPunctuationWithArabic,ue as replaceLineBreaksWithSpaces,Ee as replaceSalutationsWithSymbol,J as splitByQuotes,de as stripAllDigits,V as stripBoldStyling,H as stripItalicsStyling,U as stripStyling,W as toTitleCase,G as trimSpaceInsideQuotes,_e as truncate,ve as truncateMiddle,ye as unescapeSpaces};
|
|
10
10
|
//# sourceMappingURL=index.js.map
|