SinaTools-0.1.1-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
nlptools/arabert/arabert/create_classification_data.py
@@ -0,0 +1,260 @@
+ # Scripts used to preprocess and create the data for classifier evaluation
+ #%%
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ import sys
+ sys.path.append("..")
+
+ from arabert.preprocess import ArabertPreprocessor
+
+
+ from tqdm import tqdm
+
+ tqdm.pandas()
+
+ from tokenization import FullTokenizer
+ import run_classifier  # the module itself is used below (InputExample, convert_examples_to_features)
+ from run_classifier import input_fn_builder, model_fn_builder
+
+
+ model_name = "bert-base-arabert"
+ arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)
+
+
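+ # Lightweight container bundling a dataset's name, train/test splits, label
+ # list, and the BERT InputExamples/features that are filled in further below.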
+ class Dataset:
+     def __init__(
+         self,
+         name,
+         train,
+         test,
+         label_list,
+         train_InputExamples=None,
+         test_InputExamples=None,
+         train_features=None,
+         test_features=None,
+     ):
+         self.name = name
+         self.train = train
+         self.test = test
+         self.label_list = label_list
+         self.train_InputExamples = train_InputExamples
+         self.test_InputExamples = test_InputExamples
+         self.train_features = train_features
+         self.test_features = test_features
+
+
+ all_datasets = []
+ #%%
+ # *************HARD************
+ df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)
+
+ df_HARD = df_HARD[["rating", "review"]]  # we are interested in rating and review only
+ # encode rating as positive (1) if > 3 and negative (0) otherwise; there are no 3s in the dataset
+ df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
+ # rename columns to fit the default constructor in fastai
+ df_HARD.columns = ["label", "text"]
+ df_HARD["text"] = df_HARD["text"].progress_apply(arabert_prep.preprocess)
+ train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
+ label_list_HARD = [0, 1]
+
+ data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
+ all_datasets.append(data_Hard)
+
+ #%%
+ # *************ASTD-Unbalanced************
+ df_ASTD_UN = pd.read_csv(
+     "Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
+ )
+
+ DATA_COLUMN = "text"
+ LABEL_COLUMN = "label"
+ df_ASTD_UN.columns = [DATA_COLUMN, LABEL_COLUMN]
+
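+ # Map ASTD's string labels to integer ids: NEG=0, POS=1, NEUTRAL=2, OBJ=3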
+ df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+     lambda x: 0 if (x == "NEG") else x
+ )
+ df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+     lambda x: 1 if (x == "POS") else x
+ )
+ df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+     lambda x: 2 if (x == "NEUTRAL") else x
+ )
+ df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+     lambda x: 3 if (x == "OBJ") else x
+ )
+ df_ASTD_UN["text"] = df_ASTD_UN["text"].progress_apply(arabert_prep.preprocess)
+ train_ASTD_UN, test_ASTD_UN = train_test_split(
+     df_ASTD_UN, test_size=0.2, random_state=42
+ )
+ label_list_ASTD_UN = [0, 1, 2, 3]
+
+ data_ASTD_UN = Dataset(
+     "ASTD-Unbalanced", train_ASTD_UN, test_ASTD_UN, label_list_ASTD_UN
+ )
+ all_datasets.append(data_ASTD_UN)
+ #%%
+ # *************ASTD-Dahou-Balanced************
+
+ df_ASTD_B = pd.read_csv(
+     "Datasets\\Dahou\\data_csv_balanced\\ASTD-balanced-not-linked.csv",
+     sep=",",
+     header=0,
+ )
+
+ df_ASTD_B.columns = [DATA_COLUMN, LABEL_COLUMN]
+
+ df_ASTD_B[LABEL_COLUMN] = df_ASTD_B[LABEL_COLUMN].apply(lambda x: 0 if (x == -1) else x)
+ df_ASTD_B["text"] = df_ASTD_B["text"].progress_apply(arabert_prep.preprocess)
+ train_ASTD_B, test_ASTD_B = train_test_split(df_ASTD_B, test_size=0.2, random_state=42)
+ label_list_ASTD_B = [0, 1]
+
+ data_ASTD_B = Dataset(
+     "ASTD-Dahou-Balanced", train_ASTD_B, test_ASTD_B, label_list_ASTD_B
+ )
+ all_datasets.append(data_ASTD_B)
+
+ #%%
+ # *************ArSenTD-LEV************
+ df_ArSenTD = pd.read_csv(
+     "Datasets\\ArSenTD-LEV\\ArSenTD-LEV-processed-no-emojis2.csv", sep=",", header=0
+ )
+
+ df_ArSenTD.columns = [DATA_COLUMN, LABEL_COLUMN]
+
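+ # Map ArSenTD-LEV's five-point sentiment scale to integer ids 0..4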
+ df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+     lambda x: 0 if (x == "very_negative") else x
+ )
+ df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+     lambda x: 1 if (x == "negative") else x
+ )
+ df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+     lambda x: 2 if (x == "neutral") else x
+ )
+ df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+     lambda x: 3 if (x == "positive") else x
+ )
+ df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+     lambda x: 4 if (x == "very_positive") else x
+ )
+ df_ArSenTD["text"] = df_ArSenTD["text"].progress_apply(arabert_prep.preprocess)
+ label_list_ArSenTD = [0, 1, 2, 3, 4]
+
+ train_ArSenTD, test_ArSenTD = train_test_split(
+     df_ArSenTD, test_size=0.2, random_state=42
+ )
+
+ data_ArSenTD = Dataset("ArSenTD-LEV", train_ArSenTD, test_ArSenTD, label_list_ArSenTD)
+ all_datasets.append(data_ArSenTD)
+
+ #%%
+ # *************AJGT************
+ df_AJGT = pd.read_excel("Datasets\\Ajgt\\AJGT.xlsx", header=0)
+
+ df_AJGT = df_AJGT[["Feed", "Sentiment"]]
+ df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]
+
+ df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
+     lambda x: 0 if (x == "Negative") else x
+ )
+ df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
+     lambda x: 1 if (x == "Positive") else x
+ )
+ df_AJGT["text"] = df_AJGT["text"].progress_apply(arabert_prep.preprocess)
+ train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42)
+ label_list_AJGT = [0, 1]
+
+ data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
+ all_datasets.append(data_AJGT)
+ #%%
+ # *************LABR-UN-Binary************
+ from labr import LABR
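+ # labr.py is the helper script distributed with the LABR book-review dataset
+ # (assumed here to be importable from the working directory or sys.path)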
+
+ labr_helper = LABR()
+
+ (d_train, y_train, d_test, y_test) = labr_helper.get_train_test(
+     klass="2", balanced="unbalanced"
+ )
+
+ train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
+ test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})
+
+ train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(arabert_prep.preprocess)
+ test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(arabert_prep.preprocess)
+ label_list_LABR_B_U = [0, 1]
+
+ data_LABR_B_U = Dataset(
+     "LABR-UN-Binary", train_LABR_B_U, test_LABR_B_U, label_list_LABR_B_U
+ )
+ # all_datasets.append(data_LABR_B_U)
+
+ #%%
+ for data in tqdm(all_datasets):
+     # Use the InputExample class from BERT's run_classifier code to create examples from the data
+     data.train_InputExamples = data.train.apply(
+         lambda x: run_classifier.InputExample(
+             guid=None,  # Globally unique ID for bookkeeping, unused in this example
+             text_a=x[DATA_COLUMN],
+             text_b=None,
+             label=x[LABEL_COLUMN],
+         ),
+         axis=1,
+     )
+
+     data.test_InputExamples = data.test.apply(
+         lambda x: run_classifier.InputExample(
+             guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]
+         ),
+         axis=1,
+     )
+ #%%
+ # We'll set sequences to be at most 256 tokens long.
+ MAX_SEQ_LENGTH = 256
+
+ VOC_FNAME = "./64000_vocab_sp_70m.txt"
+ tokenizer = FullTokenizer(VOC_FNAME)
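+ # 64000_vocab_sp_70m.txt is assumed here to be the 64k WordPiece vocabulary
+ # file that ships with the pretrained AraBERT checkpoint.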
+
+ for data in tqdm(all_datasets):
+     # Convert our train and test examples to InputFeatures that BERT understands.
+     data.train_features = run_classifier.convert_examples_to_features(
+         data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
+     )
+     data.test_features = run_classifier.convert_examples_to_features(
+         data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
+     )
+
+ # %%
+ import pickle
+
+ with open("all_datasets_64k_farasa_256.pickle", "wb") as fp:  # Pickling
+     pickle.dump(all_datasets, fp)
+
+
+ # %%
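+ # Note: pickle serializes Dataset instances by reference to their defining
+ # module, so unpickling this file later requires the same Dataset class to be
+ # importable (e.g. by running this script's definitions first).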