SinaTools 0.1.1__py2.py3-none-any.whl
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
nlptools/arabert/preprocess.py
@@ -0,0 +1,818 @@
import html
import logging
import re
from typing import List

import pyarabic.araby as araby

ACCEPTED_MODELS = [
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]

SEGMENTED_MODELS = [
    "bert-base-arabert",
    "bert-base-arabertv2",
    "bert-large-arabertv2",
]

SECOND_GEN_MODELS = [
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]

class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also unprocess the text output of generated text.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page without
            the aubmindlab tag. Will default to a base Arabic preprocessor if the model name is not found.
            Currently accepted models are:

            - "bert-base-arabertv01": No farasa segmentation.
            - "bert-base-arabert": with farasa segmentation.
            - "bert-base-arabertv02": No farasa segmentation.
            - "bert-base-arabertv2": with farasa segmentation.
            - "bert-large-arabertv02": No farasa segmentation.
            - "bert-large-arabertv2": with farasa segmentation.
            - "araelectra-base": No farasa segmentation.
            - "araelectra-base-discriminator": No farasa segmentation.
            - "araelectra-base-generator": No farasa segmentation.
            - "aragpt2-base": No farasa segmentation.
            - "aragpt2-medium": No farasa segmentation.
            - "aragpt2-large": No farasa segmentation.
            - "aragpt2-mega": No farasa segmentation.

        keep_emojis (:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing.

        remove_html_markup (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to remove HTML artifacts;
            should be set to False when preprocessing TyDi QA.

        replace_urls_emails_mentions (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to replace URLs,
            emails and mentions with special tokens.

        strip_tashkeel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN,
            KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA).

        strip_tatweel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'.

        insert_white_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after
            every character that is not an Arabic or English digit, an Arabic or English letter, or one of the two
            brackets, then insert whitespace between words and numbers or numbers and words.

        remove_non_digit_repetition (:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetitions of more
            than 2 of the same non-digit character with 2 of that character.

        replace_slash_with_dash (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to
            True in AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable.
            Replaces "/" with "-", since "/" is missing from the AraBERTv2, AraELECTRA and AraGPT2 vocabularies.

        map_hindi_numbers_to_arabic (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set
            to True in AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable.
            Replaces Hindi numbers with the corresponding Arabic ones, e.g. "١٩٩٥" --> "1995". This behavior is
            present by default in AraBERTv1 and v2 (with pre-segmentation), and fixes an issue caused by a bug
            when inserting white spaces.

        apply_farasa_segmentation (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to
            True in AraBERTv1 and AraBERTv2. Set to False to force disable, and True to force enable.

    Returns:

        ArabertPreprocessor: A preprocessor instance

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name: str,
        keep_emojis: bool = False,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_non_digit_repetition: bool = True,
        replace_slash_with_dash: bool = None,
        map_hindi_numbers_to_arabic: bool = None,
        apply_farasa_segmentation: bool = None,
    ):
        """
        A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
        It can also unprocess the text output of generated text.

        Args:

            model_name (:obj:`str`): model name from the HuggingFace Models page without
                the aubmindlab tag. Will default to a base Arabic preprocessor if the model name is not found.
                Currently accepted models are:

                - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
                - :obj:`"bert-base-arabert"`: with farasa segmentation.
                - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
                - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
                - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
                - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
                - :obj:`"araelectra-base"`: No farasa segmentation.
                - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
                - :obj:`"araelectra-base-generator"`: No farasa segmentation.
                - :obj:`"aragpt2-base"`: No farasa segmentation.
                - :obj:`"aragpt2-medium"`: No farasa segmentation.
                - :obj:`"aragpt2-large"`: No farasa segmentation.
                - :obj:`"aragpt2-mega"`: No farasa segmentation.

            keep_emojis (:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing.

            remove_html_markup (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to remove HTML artifacts;
                should be set to False when preprocessing TyDi QA.

            replace_urls_emails_mentions (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to replace URLs,
                emails and mentions with special tokens.

            strip_tashkeel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN,
                KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA).

            strip_tatweel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'.

            insert_white_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and
                after every character that is not an Arabic or English digit, an Arabic or English letter, or one
                of the two brackets, then insert whitespace between words and numbers or numbers and words.

            remove_non_digit_repetition (:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetitions of
                more than 2 of the same non-digit character with 2 of that character.

            replace_slash_with_dash (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to
                True in AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable.
                Replaces "/" with "-", since "/" is missing from the AraBERTv2, AraELECTRA and AraGPT2 vocabularies.

            map_hindi_numbers_to_arabic (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set
                to True in AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force
                enable. Replaces Hindi numbers with the corresponding Arabic ones, e.g. "١٩٩٥" --> "1995". This
                behavior is present by default in AraBERTv1 and v2 (with pre-segmentation), and fixes an issue
                caused by a bug when inserting white spaces.

            apply_farasa_segmentation (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set
                to True in AraBERTv1 and AraBERTv2. Set to False to force disable, and True to force enable.

        Returns:

            ArabertPreprocessor: A preprocessor instance

        Example:

            from preprocess import ArabertPreprocessor

            arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

            arabert_prep.preprocess("SOME ARABIC TEXT")

        """
        model_name = model_name.replace("aubmindlab/", "").replace("wissamantoun/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if apply_farasa_segmentation is None:
            if self.model_name in SEGMENTED_MODELS:
                self.apply_farasa_segmentation = True
            else:
                self.apply_farasa_segmentation = False
        else:
            # self.apply_farasa_segmentation is not set yet on this branch,
            # so check the segmented-model list directly
            if (
                apply_farasa_segmentation is False
                and self.model_name in SEGMENTED_MODELS
            ):
                logging.warning(
                    "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
                )

            self.apply_farasa_segmentation = apply_farasa_segmentation

        if self.apply_farasa_segmentation:
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.error(
                    "farasapy is not installed, you won't be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            import emoji

            self.emoji = emoji
            if self.apply_farasa_segmentation:
                logging.warning(
                    "Keeping emojis with Farasa segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_non_digit_repetition = remove_non_digit_repetition

        if replace_slash_with_dash is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.replace_slash_with_dash = True
            else:
                self.replace_slash_with_dash = False
        else:
            self.replace_slash_with_dash = replace_slash_with_dash

        if map_hindi_numbers_to_arabic is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.map_hindi_numbers_to_arabic = True
            else:
                self.map_hindi_numbers_to_arabic = False
        else:
            self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic

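    # Default resolution, derived from the model lists above: with no explicit
    # overrides,
    #   ArabertPreprocessor("bert-base-arabertv2")   # farasa=True,  slash->dash=True,  hindi->arabic=True
    #   ArabertPreprocessor("bert-base-arabertv02")  # farasa=False, slash->dash=True,  hindi->arabic=True
    #   ArabertPreprocessor("bert-base-arabertv01")  # farasa=False, slash->dash=False, hindi->arabic=False
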
    def preprocess(self, text: str) -> str:
        """
        Preprocess takes an input text line and applies the same preprocessing used in
        AraBERT pretraining, or according to the settings.

        Args:

            text (:obj:`str`): input text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        if (
            self.model_name == "bert-base-arabert"
            or self.model_name == "bert-base-arabertv01"
        ):
            return self._preprocess_v1(
                text,
                do_farasa_tokenization=self.apply_farasa_segmentation,
            )

        if self.model_name in SECOND_GEN_MODELS:
            return self._preprocess_v2(text)

        return self._preprocess_v3(text)

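    # Dispatch summary: "bert-base-arabert" and "bert-base-arabertv01" use
    # _preprocess_v1; every model in SECOND_GEN_MODELS uses _preprocess_v2;
    # _preprocess_v3 is the fallback (unknown names are already mapped to
    # "bert-base-arabertv02" in __init__, so it is only reached if model_name
    # is overridden by hand).
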
    def unpreprocess(self, text: str, desegment: bool = True) -> str:
        """Re-formats the text to a classic format where punctuation, brackets and
        parentheses are not separated by whitespace. The objective is to make the
        generated text of any model appear natural and not preprocessed.

        Args:
            text (:obj:`str`): input text to be un-preprocessed
            desegment (:obj:`bool`, optional): whether or not to remove Farasa pre-segmentation first. Defaults to True.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """

        if self.apply_farasa_segmentation and desegment:
            text = self.desegment(text)

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
        text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
        text = re.sub(white_spaced_back_quotation_regex, "`" + r"\1" + "`", text)
        text = re.sub(white_spaced_em_dash, "—" + r"\1" + "—", text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

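    # Further examples (sketch, traced by hand): "3 . 14" -> "3.14" via the
    # decimal rule, and "قال : نعم" -> "قال: نعم" via left_spaced_chars.
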
    def desegment(self, text: str) -> str:
        """
        Use this function if sentence tokenization was done using
        `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
        AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
        and before the '+' for suffixes.

        Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
        """
        text = text.replace("+ ", "+")
        text = text.replace(" +", "+")
        text = " ".join([self._desegmentword(word) for word in text.split(" ")])
        return text

    def _desegmentword(self, orig_word: str) -> str:
        """
        Word desegmenter that takes a Farasa-segmented word and removes the '+' signs.

        Example:
        >>> _desegmentword("ال+يومي+ة")
        اليومية
        """
        word = orig_word.replace("ل+ال+", "لل")
        if "ال+ال" not in orig_word:
            word = word.replace("ل+ال", "لل")
        word = word.replace("+", "")
        word = word.replace("للل", "لل")
        return word

    def _preprocess_v3(self, text: str) -> str:
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs with [رابط]
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace before and after all non Arabic digits or English digits and alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])",
                r" \1 ",
                text,
            )

            # re-fix brackets
            text = text.replace("[ رابط ]", "[رابط]")
            text = text.replace("[ بريد ]", "[بريد]")
            text = text.replace("[ مستخدم ]", "[مستخدم]")

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                "(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)",
                r" \1 \2 ",
                text,
            )
            text = re.sub(
                "([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)",
                r" \1 \2 ",
                text,
            )

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regexv2, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regexv2, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if self.apply_farasa_segmentation:
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # all the other models don't require Farasa segmentation
        return text

    def _preprocess_v2(self, text: str) -> str:
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs with [رابط]
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace before and after all non Arabic digits or English digits and alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                "(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
            )
            text = re.sub(
                "([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
            )

        if self.replace_slash_with_dash:
            text = text.replace("/", "-")

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if (
            self.model_name == "bert-base-arabertv2"
            or self.model_name == "bert-large-arabertv2"
        ):
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # all the other models don't require Farasa segmentation
        return text

    def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
        """
        AraBERTv1 preprocessing function.
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)

        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        text = " ".join(text.split())

        return text

    def _farasa_segment(self, text: str) -> str:
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [
                "[",
                "]",
            ]:
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_word = self._split_farasa_output(word)
            segmented_line.extend(segmented_word)

        return " ".join(segmented_line)

    def _split_farasa_output(self, word: str) -> List[str]:
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c == "+":
                # if the token is KAF, it could be a suffix or a prefix
                if temp_token == "ك":
                    # if we are at the second character, then KAF is surely a prefix
                    if i == 1:
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # if the KAF token is between 2 tokens
                    elif word[i - 2] == "+":
                        # if the previous token is a prefix, then this KAF must be a prefix
                        if segmented_word[-1][-1] == "+":
                            segmented_word.append(temp_token + "+")
                            temp_token = ""
                        # else it is a suffix, this KAF could not be a second suffix
                        else:
                            segmented_word.append("+" + temp_token)
                            temp_token = ""
                    # if KAF is at the end, this is handled by the statement after the loop
                elif temp_token in prefix_list:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                elif temp_token in suffix_list:
                    segmented_word.append("+" + temp_token)
                    temp_token = ""
                else:
                    segmented_word.append(temp_token)
                    temp_token = ""
                continue
            temp_token += c
        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word

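    # Worked examples for the KAF ('ك') disambiguation above (sketch, traced by hand):
    #   "و+ك+تب" -> ["و+", "ك+", "تب"]   (the previous segment ends with '+', so KAF is a prefix)
    #   "كتاب+ك" -> ["كتاب", "+ك"]       (a trailing KAF is appended after the loop as a suffix)
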
    def _tokenize_arabic_words_farasa(self, line_input: str) -> str:

        if self.keep_emojis:
            # segment word by word so that emojis are kept out of Farasa
            line_farasa = []
            for word in line_input.split():
                if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                    line_farasa.append(word)
                else:
                    line_farasa.append(self.farasa_segmenter.segment(word))
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [
                "[",
                "]",
            ]:
                segmented_line.append("[" + word + "]")
                continue
            segmented_word = []
            for token in word.split("+"):
                if token in prefix_list:
                    segmented_word.append(token + "+")
                elif token in suffix_list:
                    segmented_word.append("+" + token)
                else:
                    segmented_word.append(token)
            segmented_line.extend(segmented_word)
        return " ".join(segmented_line)

    def _remove_non_digit_repetition(self, text: str) -> str:
        """
        :param text: the input text with character elongation
        :return: the de-elongated text
        """
        # loop over the number of times the regex matched the text
        # OLD
        # for index_ in range(len(re.findall(regex_tatweel, text))):
        #     elongation = re.search(regex_tatweel, text)
        #     if elongation:
        #         elongation_pattern = elongation.group()
        #         elongation_replacement = elongation_pattern[0]
        #         elongation_pattern = re.escape(elongation_pattern)
        #         text = re.sub(
        #             elongation_pattern, elongation_replacement, text, flags=re.MULTILINE
        #         )
        #     else:
        #         break

        # NEW: collapse any run of 3 or more identical non-digit characters down to 2
        text = multiple_char_pattern.sub(r"\1\1", text)
        return text

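    # Example (sketch): "رااااائع" -> "رااائع" becomes "رااائع"? No: the run of five
    # ALEFs collapses to two, giving "رااائع" -> "رااائع"; concretely,
    # _remove_non_digit_repetition("رااااائع") == "رااائع" with exactly two ALEFs,
    # while digit runs like "2000" are untouched since the pattern matches \D only.
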
    def _remove_redundant_punct(self, text: str) -> str:
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(list(sub)) + " "
            text = "".join(
                (text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :])
            )
            text_ = "".join(
                (text_[: result.span()[0]], text_[result.span()[1] :])
            ).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()


prefix_list = [
    "ال",
    "و",
    "ف",
    "ب",
    "ك",
    "ل",
    "لل",
    "\u0627\u0644",
    "\u0648",
    "\u0641",
    "\u0628",
    "\u0643",
    "\u0644",
    "\u0644\u0644",
    "س",
]
suffix_list = [
    "ه",
    "ها",
    "ك",
    "ي",
    "هما",
    "كما",
    "نا",
    "كم",
    "هم",
    "هن",
    "كن",
    "ا",
    "ان",
    "ين",
    "ون",
    "وا",
    "ات",
    "ت",
    "ن",
    "ة",
    "\u0647",
    "\u0647\u0627",
    "\u0643",
    "\u064a",
    "\u0647\u0645\u0627",
    "\u0643\u0645\u0627",
    "\u0646\u0627",
    "\u0643\u0645",
    "\u0647\u0645",
    "\u0647\u0646",
    "\u0643\u0646",
    "\u0627",
    "\u0627\u0646",
    "\u064a\u0646",
    "\u0648\u0646",
    "\u0648\u0627",
    "\u0627\u062a",
    "\u062a",
    "\u0646",
    "\u0629",
]
other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"]

# the never_split list is used with the transformers library
prefix_symbols = [x + "+" for x in prefix_list]
suffix_symbols = ["+" + x for x in suffix_list]
never_split_tokens = list(set(prefix_symbols + suffix_symbols + other_tokens))

url_regexes = [
    r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
    r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
    r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
    r"www[a-zA-Z0-9_\-?=%&/.~]+",
    r"[a-zA-Z]+\.com",
    r"(?=http)[^\s]+",
    r"(?=www)[^\s]+",
    r"://",
]
user_mention_regex = r"@[\w\d]+"
email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
redundant_punct_pattern = (
    r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})"
)

regex_tatweel = r"(\D)\1{2,}"
multiple_char_pattern = re.compile(r"(\D)\1{2,}", re.DOTALL)

rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]"
rejected_chars_regexv2 = r"[^0-9\u0621-\u063A\u0641-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/]"

regex_url_step1 = r"(?=http)[^\s]+"
regex_url_step2 = r"(?=www)[^\s]+"
regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
regex_mention = r"@[\w\d]+"
regex_email = r"\S+@\S+"

chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘"
chars_regexv2 = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/"

white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"'
white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'"
white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`"
white_spaced_em_dash = r"\—\s+([^—]+)\s+\—"

left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])"
right_spaced_chars = r"([\[\(\{“«‘*\~]) "
left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) "

hindi_nums = "٠١٢٣٤٥٦٧٨٩"
arabic_nums = "0123456789"
hindi_to_arabic_map = str.maketrans(hindi_nums, arabic_nums)
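
# Minimal usage sketch, mirroring the docstring example. Assumes pyarabic is
# installed (required by the module import above); the segmented models would
# additionally need farasapy.
if __name__ == "__main__":
    prep = ArabertPreprocessor("bert-base-arabertv02")
    sample = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
    processed = prep.preprocess(sample)
    print(processed)
    # round-trip the model-ready text back to a natural-looking string
    print(prep.unpreprocess(processed))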