SinaTools 0.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
@@ -0,0 +1,818 @@
+ import html
+ import logging
+ import re
+ from typing import List
+
+ import pyarabic.araby as araby
+
+ ACCEPTED_MODELS = [
+     "bert-base-arabertv01",
+     "bert-base-arabert",
+     "bert-base-arabertv02",
+     "bert-base-arabertv2",
+     "bert-large-arabertv02",
+     "bert-large-arabertv2",
+     "araelectra-base",
+     "araelectra-base-discriminator",
+     "araelectra-base-generator",
+     "araelectra-base-artydiqa",
+     "aragpt2-base",
+     "aragpt2-medium",
+     "aragpt2-large",
+     "aragpt2-mega",
+ ]
+
+ SEGMENTED_MODELS = [
+     "bert-base-arabert",
+     "bert-base-arabertv2",
+     "bert-large-arabertv2",
+ ]
+
+ SECOND_GEN_MODELS = [
+     "bert-base-arabertv02",
+     "bert-base-arabertv2",
+     "bert-large-arabertv02",
+     "bert-large-arabertv2",
+     "araelectra-base",
+     "araelectra-base-discriminator",
+     "araelectra-base-generator",
+     "araelectra-base-artydiqa",
+     "aragpt2-base",
+     "aragpt2-medium",
+     "aragpt2-large",
+     "aragpt2-mega",
+ ]
+
+
+ class ArabertPreprocessor:
+     """
+     A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
+     It can also unprocess the text output of generated text.
+
+     Args:
+
+         model_name (:obj:`str`): model name from the HuggingFace Models page without
+             the aubmindlab tag. Will default to a base Arabic preprocessor if the model name is not found.
+             Currently accepted models are:
+
+             - "bert-base-arabertv01": No farasa segmentation.
+             - "bert-base-arabert": with farasa segmentation.
+             - "bert-base-arabertv02": No farasa segmentation.
+             - "bert-base-arabertv2": with farasa segmentation.
+             - "bert-large-arabertv02": No farasa segmentation.
+             - "bert-large-arabertv2": with farasa segmentation.
+             - "araelectra-base": No farasa segmentation.
+             - "araelectra-base-discriminator": No farasa segmentation.
+             - "araelectra-base-generator": No farasa segmentation.
+             - "aragpt2-base": No farasa segmentation.
+             - "aragpt2-medium": No farasa segmentation.
+             - "aragpt2-large": No farasa segmentation.
+             - "aragpt2-mega": No farasa segmentation.
+
+         keep_emojis (:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing.
+
+         remove_html_markup (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to remove html artifacts;
+             should be set to False when preprocessing TyDi QA.
+
+         replace_urls_emails_mentions (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to replace urls, emails
+             and mentions with special tokens.
+
+         strip_tashkeel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
+             KASRA, SUKUN, SHADDA).
+
+         strip_tatweel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'.
+
+         insert_white_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non-Arabic digits,
+             English digits, Arabic and English letters, and the 2 brackets, then insert whitespace
+             between words and numbers or numbers and words.
+
+         remove_non_digit_repetition (:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetitions of more than 2 non-digit characters
+             with 2 of that character.
+
+         replace_slash_with_dash (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to True in AraBERTv02,
+             AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable. Replaces "/" with "-",
+             since "/" is missing from the AraBERTv2, AraELECTRA and AraGPT2 vocabulary.
+
+         map_hindi_numbers_to_arabic (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to True in
+             AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable.
+             Replaces Hindi numbers with the corresponding Arabic ones. ex: "١٩٩٥" --> "1995".
+             This behavior is present by default in AraBERTv1 and v2 (with pre-segmentation),
+             and fixes an issue caused by a bug when inserting white spaces.
+
+         apply_farasa_segmentation (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to True in
+             AraBERTv2 and AraBERTv1. Set to False to force disable, and True to force enable.
+
+     Returns:
+
+         ArabertPreprocessor: A preprocessor instance
+
+     Example:
+
+         from preprocess import ArabertPreprocessor
+
+         arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")
+
+         arabert_prep.preprocess("SOME ARABIC TEXT")
+     """
+
+     def __init__(
+         self,
+         model_name: str,
+         keep_emojis: bool = False,
+         remove_html_markup: bool = True,
+         replace_urls_emails_mentions: bool = True,
+         strip_tashkeel: bool = True,
+         strip_tatweel: bool = True,
+         insert_white_spaces: bool = True,
+         remove_non_digit_repetition: bool = True,
+         replace_slash_with_dash: bool = None,
+         map_hindi_numbers_to_arabic: bool = None,
+         apply_farasa_segmentation: bool = None,
+     ):
+         """
+         A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
+         It can also unprocess the text output of generated text.
+
+         Args:
+
+             model_name (:obj:`str`): model name from the HuggingFace Models page without
+                 the aubmindlab tag. Will default to a base Arabic preprocessor if the model name is not found.
+                 Currently accepted models are:
+
+                 - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
+                 - :obj:`"bert-base-arabert"`: with farasa segmentation.
+                 - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
+                 - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
+                 - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
+                 - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
+                 - :obj:`"araelectra-base"`: No farasa segmentation.
+                 - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
+                 - :obj:`"araelectra-base-generator"`: No farasa segmentation.
+                 - :obj:`"aragpt2-base"`: No farasa segmentation.
+                 - :obj:`"aragpt2-medium"`: No farasa segmentation.
+                 - :obj:`"aragpt2-large"`: No farasa segmentation.
+                 - :obj:`"aragpt2-mega"`: No farasa segmentation.
+
+             keep_emojis (:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing.
+
+             remove_html_markup (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to remove html artifacts;
+                 should be set to False when preprocessing TyDi QA.
+
+             replace_urls_emails_mentions (:obj:`bool`, `optional`, defaults to :obj:`True`): whether to replace urls, emails
+                 and mentions with special tokens.
+
+             strip_tashkeel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
+                 KASRA, SUKUN, SHADDA).
+
+             strip_tatweel (:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'.
+
+             insert_white_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non-Arabic digits,
+                 English digits, Arabic and English letters, and the 2 brackets, then insert whitespace
+                 between words and numbers or numbers and words.
+
+             remove_non_digit_repetition (:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetitions of more than 2 non-digit characters
+                 with 2 of that character.
+
+             replace_slash_with_dash (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to True in AraBERTv02,
+                 AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable. Replaces "/" with "-",
+                 since "/" is missing from the AraBERTv2, AraELECTRA and AraGPT2 vocabulary.
+
+             map_hindi_numbers_to_arabic (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to True in
+                 AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable.
+                 Replaces Hindi numbers with the corresponding Arabic ones. ex: "١٩٩٥" --> "1995".
+                 This behavior is present by default in AraBERTv1 and v2 (with pre-segmentation),
+                 and fixes an issue caused by a bug when inserting white spaces.
+
+             apply_farasa_segmentation (:obj:`bool`, `optional`, defaults to :obj:`None`): will be automatically set to True in
+                 AraBERTv2 and AraBERTv1. Set to False to force disable, and True to force enable.
+
+         Returns:
+
+             ArabertPreprocessor: A preprocessor instance
+
+         Example:
+
+             from preprocess import ArabertPreprocessor
+
+             arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")
+
+             arabert_prep.preprocess("SOME ARABIC TEXT")
+
+         """
+         model_name = model_name.replace("aubmindlab/", "").replace("wissamantoun/", "")
+
+         if model_name not in ACCEPTED_MODELS:
+             logging.warning(
+                 """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
+             )
+             self.model_name = "bert-base-arabertv02"
+         else:
+             self.model_name = model_name
+
+         if apply_farasa_segmentation is None:
+             if self.model_name in SEGMENTED_MODELS:
+                 self.apply_farasa_segmentation = True
+             else:
+                 self.apply_farasa_segmentation = False
+         else:
+             if (
+                 apply_farasa_segmentation == False
+                 and self.model_name in SEGMENTED_MODELS
+             ):
+                 logging.warning(
+                     "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
+                 )
+
+             self.apply_farasa_segmentation = apply_farasa_segmentation
+
+         if self.apply_farasa_segmentation:
+             try:
+                 from farasa.segmenter import FarasaSegmenter
+
+                 self.farasa_segmenter = FarasaSegmenter(interactive=True)
+             except ModuleNotFoundError:
+                 logging.error(
+                     "farasapy is not installed, you won't be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
+                 )
+
+         self.keep_emojis = keep_emojis
+         if self.keep_emojis:
+             import emoji
+
+             self.emoji = emoji
+             if self.apply_farasa_segmentation:
+                 logging.warning(
+                     "Keeping tweets with Farasa Segmentation is 10 times slower"
+                 )
+
+         self.remove_html_markup = remove_html_markup
+         self.replace_urls_emails_mentions = replace_urls_emails_mentions
+         self.strip_tashkeel = strip_tashkeel
+         self.strip_tatweel = strip_tatweel
+         self.insert_white_spaces = insert_white_spaces
+         self.remove_non_digit_repetition = remove_non_digit_repetition
+
+         if replace_slash_with_dash is None:
+             if self.model_name in SECOND_GEN_MODELS:
+                 self.replace_slash_with_dash = True
+             else:
+                 self.replace_slash_with_dash = False
+         else:
+             self.replace_slash_with_dash = replace_slash_with_dash
+
+         if map_hindi_numbers_to_arabic is None:
+             if self.model_name in SECOND_GEN_MODELS:
+                 self.map_hindi_numbers_to_arabic = True
+             else:
+                 self.map_hindi_numbers_to_arabic = False
+         else:
+             self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic
+
+     def preprocess(self, text: str) -> str:
+         """
+         Preprocess takes an input text line and applies the same preprocessing used in AraBERT
+         pretraining, or according to the given settings.
+
+         Args:
+
+             text (:obj:`str`): input text string
+
+         Returns:
+
+             string: A preprocessed string depending on which model was selected
+         """
+         if (
+             self.model_name == "bert-base-arabert"
+             or self.model_name == "bert-base-arabertv01"
+         ):
+             return self._preprocess_v1(
+                 text,
+                 do_farasa_tokenization=self.apply_farasa_segmentation,
+             )
+
+         if self.model_name in SECOND_GEN_MODELS:
+             return self._preprocess_v2(text)
+
+         return self._preprocess_v3(text)
+
+     def unpreprocess(self, text: str, desegment: bool = True) -> str:
+         """Re-formats the text to a classic format where punctuation, brackets and parentheses are not separated by whitespace.
+         The objective is to make the generated text of any model appear natural and not preprocessed.
+
+         Args:
+             text (:obj:`str`): input text to be un-preprocessed
+             desegment (:obj:`bool`, optional): whether to remove Farasa pre-segmentation first.
+
+         Returns:
+             str: The unpreprocessed (and possibly Farasa-desegmented) text.
+         """
+
+         if self.apply_farasa_segmentation and desegment:
+             text = self.desegment(text)
+
+         # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
+         # https://stackoverflow.com/a/53436792/5381220
+         text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
+         text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
+         text = re.sub(white_spaced_back_quotation_regex, "\`" + r"\1" + "\`", text)
+         text = re.sub(white_spaced_em_dash, "\—" + r"\1" + "\—", text)
+
+         # during generation, sometimes the models don't put a space after the dot, this handles it
+         text = text.replace(".", " . ")
+         text = " ".join(text.split())
+
+         # handle decimals
+         text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
+         text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)
+
+         text = re.sub(left_and_right_spaced_chars, r"\1", text)
+         text = re.sub(left_spaced_chars, r"\1", text)
+         text = re.sub(right_spaced_chars, r"\1", text)
+
+         return text
+
+     def desegment(self, text: str) -> str:
+         """
+         Use this function if sentence tokenization was done using
+         `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
+         AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
+         and a space before the '+' for suffixes.
+
+         Example:
+         >>> desegment('ال+ دراس +ات')
+         الدراسات
+         """
+         text = text.replace("+ ", "+")
+         text = text.replace(" +", "+")
+         text = " ".join([self._desegmentword(word) for word in text.split(" ")])
+         return text
+
+     def _desegmentword(self, orig_word: str) -> str:
+         """
+         Word segmentor that takes a Farasa Segmented Word and removes the '+' signs
+
+         Example:
+         >>> _desegmentword("ال+يومي+ة")
+         اليومية
+         """
+         word = orig_word.replace("ل+ال+", "لل")
+         if "ال+ال" not in orig_word:
+             word = word.replace("ل+ال", "لل")
+         word = word.replace("+", "")
+         word = word.replace("للل", "لل")
+         return word
+
+     def _preprocess_v3(self, text: str) -> str:
+         text = str(text)
+         text = html.unescape(text)
+         if self.strip_tashkeel:
+             text = araby.strip_tashkeel(text)
+         if self.strip_tatweel:
+             text = araby.strip_tatweel(text)
+
+         if self.replace_urls_emails_mentions:
+             # replace all possible URLs
+             for reg in url_regexes:
+                 text = re.sub(reg, " [رابط] ", text)
+             # replace emails with [بريد]
+             for reg in email_regexes:
+                 text = re.sub(reg, " [بريد] ", text)
+             # replace mentions with [مستخدم]
+             text = re.sub(user_mention_regex, " [مستخدم] ", text)
+
+         if self.remove_html_markup:
+             # remove html line breaks
+             text = re.sub("<br />", " ", text)
+             # remove html markup
+             text = re.sub("</?[^>]+>", " ", text)
+
+         if self.map_hindi_numbers_to_arabic:
+             text = text.translate(hindi_to_arabic_map)
+
+         # remove repeated characters >2
+         if self.remove_non_digit_repetition:
+             text = self._remove_non_digit_repetition(text)
+
+         # insert whitespace before and after all non Arabic digits or English digits and alphabet and the 2 brackets
+         if self.insert_white_spaces:
+             text = re.sub(
+                 "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])",
+                 r" \1 ",
+                 text,
+             )
+
+             # re-fix brackets
+             text = text.replace("[ رابط ]", "[رابط]")
+             text = text.replace("[ بريد ]", "[بريد]")
+             text = text.replace("[ مستخدم ]", "[مستخدم]")
+
+             # insert whitespace between words and numbers or numbers and words
+             text = re.sub(
+                 "(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)",
+                 r" \1 \2 ",
+                 text,
+             )
+             text = re.sub(
+                 "([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)",
+                 r" \1 \2 ",
+                 text,
+             )
+
+         # remove unwanted characters
+         if self.keep_emojis:
+             emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
+             rejected_chars_regex2 = "[^%s%s]" % (chars_regexv2, emoji_regex)
+             text = re.sub(rejected_chars_regex2, " ", text)
+         else:
+             text = re.sub(rejected_chars_regexv2, " ", text)
+
+         # remove extra spaces
+         text = " ".join(text.replace("\uFE0F", "").split())
+
+         if self.apply_farasa_segmentation:
+             if self.keep_emojis:
+                 new_text = []
+                 for word in text.split():
+                     if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
+                         new_text.append(word)
+                     else:
+                         new_text.append(self.farasa_segmenter.segment(word))
+                 text = " ".join(new_text)
+             else:
+                 text = self.farasa_segmenter.segment(text)
+             return self._farasa_segment(text)
+
+         # All the other models don't require Farasa segmentation
+         return text
+
+     def _preprocess_v2(self, text: str) -> str:
+         text = str(text)
+         text = html.unescape(text)
+         if self.strip_tashkeel:
+             text = araby.strip_tashkeel(text)
+         if self.strip_tatweel:
+             text = araby.strip_tatweel(text)
+
+         if self.replace_urls_emails_mentions:
+             # replace all possible URLs
+             for reg in url_regexes:
+                 text = re.sub(reg, " [رابط] ", text)
+             # replace emails with [بريد]
+             for reg in email_regexes:
+                 text = re.sub(reg, " [بريد] ", text)
+             # replace mentions with [مستخدم]
+             text = re.sub(user_mention_regex, " [مستخدم] ", text)
+
+         if self.remove_html_markup:
+             # remove html line breaks
+             text = re.sub("<br />", " ", text)
+             # remove html markup
+             text = re.sub("</?[^>]+>", " ", text)
+
+         if self.map_hindi_numbers_to_arabic:
+             text = text.translate(hindi_to_arabic_map)
+
+         # remove repeated characters >2
+         if self.remove_non_digit_repetition:
+             text = self._remove_non_digit_repetition(text)
+
+         # insert whitespace before and after all non Arabic digits or English digits and alphabet and the 2 brackets
+         if self.insert_white_spaces:
+             text = re.sub(
+                 "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
+                 r" \1 ",
+                 text,
+             )
+
+             # insert whitespace between words and numbers or numbers and words
+             text = re.sub(
+                 "(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
+             )
+             text = re.sub(
+                 "([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
+             )
+
+         if self.replace_slash_with_dash:
+             text = text.replace("/", "-")
+
+         # remove unwanted characters
+         if self.keep_emojis:
+             emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
+             rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
+             text = re.sub(rejected_chars_regex2, " ", text)
+         else:
+             text = re.sub(rejected_chars_regex, " ", text)
+
+         # remove extra spaces
+         text = " ".join(text.replace("\uFE0F", "").split())
+
+         if (
+             self.model_name == "bert-base-arabertv2"
+             or self.model_name == "bert-large-arabertv2"
+         ):
+             if self.keep_emojis:
+                 new_text = []
+                 for word in text.split():
+                     if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
+                         new_text.append(word)
+                     else:
+                         new_text.append(self.farasa_segmenter.segment(word))
+                 text = " ".join(new_text)
+             else:
+                 text = self.farasa_segmenter.segment(text)
+             return self._farasa_segment(text)
+
+         # All the other models don't require Farasa segmentation
+         return text
+
+     def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
+         """
+         AraBERTv1 preprocessing Function
+         """
+         text = str(text)
+         if self.strip_tashkeel:
+             text = araby.strip_tashkeel(text)
+
+         text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
+         text = re.sub("ـ", "", text)
+         text = re.sub("[«»]", ' " ', text)
+
+         if self.replace_urls_emails_mentions:
+             # replace the [رابط] token with space if you want to clean links
+             text = re.sub(regex_url_step1, "[رابط]", text)
+             text = re.sub(regex_url_step2, "[رابط]", text)
+             text = re.sub(regex_url, "[رابط]", text)
+             text = re.sub(regex_email, "[بريد]", text)
+             text = re.sub(regex_mention, "[مستخدم]", text)
+         text = re.sub("…", r"\.", text).strip()
+         text = self._remove_redundant_punct(text)
+
+         if self.replace_urls_emails_mentions:
+             text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
+             text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
+             text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)
+
+         if self.remove_non_digit_repetition:
+             text = self._remove_non_digit_repetition(text)
+
+         if self.insert_white_spaces:
+             text = re.sub(
+                 "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
+                 r" \1 ",
+                 text,
+             )
+         if do_farasa_tokenization:
+             text = self._tokenize_arabic_words_farasa(text)
+
+         text = " ".join(text.split())
+
+         return text
+
+     def _farasa_segment(self, text: str) -> str:
+         line_farasa = text.split()
+         segmented_line = []
+         for index, word in enumerate(line_farasa):
+             if word in ["[", "]"]:
+                 continue
+             if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [
+                 "[",
+                 "]",
+             ]:
+                 segmented_line.append("[" + word + "]")
+                 continue
+             if "+" not in word:
+                 segmented_line.append(word)
+                 continue
+             segmented_word = self._split_farasa_output(word)
+             segmented_line.extend(segmented_word)
+
+         return " ".join(segmented_line)
+
+     def _split_farasa_output(self, word: str) -> List[str]:
+         segmented_word = []
+         temp_token = ""
+         for i, c in enumerate(word):
+             if c == "+":
+                 # if the token is KAF, it could be a suffix or prefix
+                 if temp_token == "ك":
+                     # if we are at the second token, then KAF is surely a prefix
+                     if i == 1:
+                         segmented_word.append(temp_token + "+")
+                         temp_token = ""
+                     # if the KAF token is between 2 tokens
+                     elif word[i - 2] == "+":
+                         # if the previous token is a prefix, then this KAF must be a prefix
+                         if segmented_word[-1][-1] == "+":
+                             segmented_word.append(temp_token + "+")
+                             temp_token = ""
+                         # else it is a suffix, this KAF could not be a second suffix
+                         else:
+                             segmented_word.append("+" + temp_token)
+                             temp_token = ""
+                     # if KAF is at the end, this is handled with the statement after the loop
+                 elif temp_token in prefix_list:
+                     segmented_word.append(temp_token + "+")
+                     temp_token = ""
+                 elif temp_token in suffix_list:
+                     segmented_word.append("+" + temp_token)
+                     temp_token = ""
+                 else:
+                     segmented_word.append(temp_token)
+                     temp_token = ""
+                 continue
+             temp_token += c
+         if temp_token != "":
+             if temp_token in suffix_list:
+                 segmented_word.append("+" + temp_token)
+             else:
+                 segmented_word.append(temp_token)
+         return segmented_word
+
+     def _tokenize_arabic_words_farasa(self, line_input: str) -> str:
+
+         if self.keep_emojis:
+             # insert whitespace before and after all non Arabic digits or English digits and alphabet and the 2 brackets
+             line_farasa = []
+             for word in line_input.split():
+                 if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
+                     line_farasa.append(word)
+                 else:
+                     line_farasa.append(self.farasa_segmenter.segment(word))
+         else:
+             line_farasa = self.farasa_segmenter.segment(line_input).split()
+
+         segmented_line = []
+         for index, word in enumerate(line_farasa):
+             if word in ["[", "]"]:
+                 continue
+             if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [
+                 "[",
+                 "]",
+             ]:
+                 segmented_line.append("[" + word + "]")
+                 continue
+             segmented_word = []
+             for token in word.split("+"):
+                 if token in prefix_list:
+                     segmented_word.append(token + "+")
+                 elif token in suffix_list:
+                     segmented_word.append("+" + token)
+                 else:
+                     segmented_word.append(token)
+             segmented_line.extend(segmented_word)
+         return " ".join(segmented_line)
+
+     def _remove_non_digit_repetition(self, text: str) -> str:
+         """
+         :param text: the input text from which to remove elongation
+         :return: de-elongated text
+         """
+         # loop over the number of times the regex matched the text
+         # OLD
+         # for index_ in range(len(re.findall(regex_tatweel, text))):
+         #     elongation = re.search(regex_tatweel, text)
+         #     if elongation:
+         #         elongation_pattern = elongation.group()
+         #         elongation_replacement = elongation_pattern[0]
+         #         elongation_pattern = re.escape(elongation_pattern)
+         #         text = re.sub(
+         #             elongation_pattern, elongation_replacement, text, flags=re.MULTILINE
+         #         )
+         #     else:
+         #         break
+
+         # New
+         text = multiple_char_pattern.sub(r"\1\1", text)
+         return text
+
+     def _remove_redundant_punct(self, text: str) -> str:
+         text_ = text
+         result = re.search(redundant_punct_pattern, text)
+         dif = 0
+         while result:
+             sub = result.group()
+             sub = sorted(set(sub), key=sub.index)
+             sub = " " + "".join(list(sub)) + " "
+             text = "".join(
+                 (text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :])
+             )
+             text_ = "".join(
+                 (text_[: result.span()[0]], text_[result.span()[1] :])
+             ).strip()
+             dif = abs(len(text) - len(text_))
+             result = re.search(redundant_punct_pattern, text_)
+         text = re.sub(r"\s+", " ", text)
+         return text.strip()
+
+
+ prefix_list = [
+     "ال",
+     "و",
+     "ف",
+     "ب",
+     "ك",
+     "ل",
+     "لل",
+     "\u0627\u0644",
+     "\u0648",
+     "\u0641",
+     "\u0628",
+     "\u0643",
+     "\u0644",
+     "\u0644\u0644",
+     "س",
+ ]
+ suffix_list = [
+     "ه",
+     "ها",
+     "ك",
+     "ي",
+     "هما",
+     "كما",
+     "نا",
+     "كم",
+     "هم",
+     "هن",
+     "كن",
+     "ا",
+     "ان",
+     "ين",
+     "ون",
+     "وا",
+     "ات",
+     "ت",
+     "ن",
+     "ة",
+     "\u0647",
+     "\u0647\u0627",
+     "\u0643",
+     "\u064a",
+     "\u0647\u0645\u0627",
+     "\u0643\u0645\u0627",
+     "\u0646\u0627",
+     "\u0643\u0645",
+     "\u0647\u0645",
+     "\u0647\u0646",
+     "\u0643\u0646",
+     "\u0627",
+     "\u0627\u0646",
+     "\u064a\u0646",
+     "\u0648\u0646",
+     "\u0648\u0627",
+     "\u0627\u062a",
+     "\u062a",
+     "\u0646",
+     "\u0629",
+ ]
+ other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"]
+
+ # the never_split list is used with the transformers library
+ prefix_symbols = [x + "+" for x in prefix_list]
+ suffix_symbols = ["+" + x for x in suffix_list]
+ never_split_tokens = list(set(prefix_symbols + suffix_symbols + other_tokens))
+
+ url_regexes = [
+     r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
+     r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
+     r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
+     r"www[a-zA-Z0-9_\-?=%&/.~]+",
+     r"[a-zA-Z]+\.com",
+     r"(?=http)[^\s]+",
+     r"(?=www)[^\s]+",
+     r"://",
+ ]
+ user_mention_regex = r"@[\w\d]+"
+ email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
+ redundant_punct_pattern = (
+     r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})"
+ )
+
+ regex_tatweel = r"(\D)\1{2,}"
+ multiple_char_pattern = re.compile(r"(\D)\1{2,}", re.DOTALL)
+
+ rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]"
+ rejected_chars_regexv2 = r"[^0-9\u0621-\u063A\u0641-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/]"
+
+ regex_url_step1 = r"(?=http)[^\s]+"
+ regex_url_step2 = r"(?=www)[^\s]+"
+ regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
+ regex_mention = r"@[\w\d]+"
+ regex_email = r"\S+@\S+"
+
+ chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘"
+ chars_regexv2 = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/"
+
+ white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"'
+ white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'"
+ white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`"
+ white_spaced_em_dash = r"\—\s+([^—]+)\s+\—"
+
+ left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])"
+ right_spaced_chars = r"([\[\(\{“«‘*\~]) "
+ left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) "
+
+ hindi_nums = "٠١٢٣٤٥٦٧٨٩"
+ arabic_nums = "0123456789"
+ hindi_to_arabic_map = str.maketrans(hindi_nums, arabic_nums)
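For orientation, a minimal usage sketch of the preprocessor this file adds. The import path is assumed from the wheel layout listed above (nlptools/arabert/preprocess.py), the model name and sample sentence are illustrative, and farasapy must be installed separately for the segmented v1/v2 models:

    from nlptools.arabert.preprocess import ArabertPreprocessor

    # a v2 model, so Farasa pre-segmentation is applied (see SEGMENTED_MODELS above)
    prep = ArabertPreprocessor(model_name="bert-base-arabertv2")

    # clean and segment a sentence before feeding it to a tokenizer/model
    processed = prep.preprocess("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري")

    # undo the whitespace insertion and '+' segmentation markers on generated text
    restored = prep.unpreprocess(processed)

unpreprocess reverses the whitespace insertion and, for segmented models, the Farasa '+' markers, which is what makes generated output read as natural text.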