SinaTools-0.1.1-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
nlptools/arabert/arabert/create_classification_data.py
@@ -0,0 +1,260 @@
+# Scripts used to pre_process and create the data for classifier evaluation
+#%%
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+import sys
+sys.path.append("..")
+
+from arabert.preprocess import ArabertPreprocessor
+
+
+from tqdm import tqdm
+
+tqdm.pandas()
+
+from tokenization import FullTokenizer
+from run_classifier import input_fn_builder, model_fn_builder
+
+
+model_name = "bert-base-arabert"
+arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)
+
+
+class Dataset:
+    def __init__(
+        self,
+        name,
+        train,
+        test,
+        label_list,
+        train_InputExamples=None,
+        test_InputExamples=None,
+        train_features=None,
+        test_features=None,
+    ):
+        self.name = name
+        self.train = train
+        self.test = test
+        self.label_list = label_list
+        self.train_InputExamples = train_InputExamples
+        self.test_InputExamples = test_InputExamples
+        self.train_features = train_features
+        self.test_features = test_features
+
+
+all_datasets = []
+#%%
+# *************HARD************
+df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)
+
+df_HARD = df_HARD[["rating", "review"]]  # we are interested in rating and review only
+# code rating as +ve if > 3, -ve if less, no 3s in dataset
+df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
+# rename columns to fit default constructor in fastai
+df_HARD.columns = ["label", "text"]
+df_HARD["text"] = df_HARD["text"].progress_apply(
+    lambda x: arabert_prep.preprocess(
+        x
+    )
+)
+train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
+label_list_HARD = [0, 1]
+
+data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
+all_datasets.append(data_Hard)
+
+#%%
+# *************ASTD-Unbalanced************
+df_ASTD_UN = pd.read_csv(
+    "Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
+)
+
+DATA_COLUMN = "text"
+LABEL_COLUMN = "label"
+df_ASTD_UN.columns = [DATA_COLUMN, LABEL_COLUMN]
+
+df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+    lambda x: 0 if (x == "NEG") else x
+)
+df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+    lambda x: 1 if (x == "POS") else x
+)
+df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+    lambda x: 2 if (x == "NEUTRAL") else x
+)
+df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
+    lambda x: 3 if (x == "OBJ") else x
+)
+df_ASTD_UN["text"] = df_ASTD_UN["text"].progress_apply(
+    lambda x: arabert_prep.preprocess(
+        x
+    )
+)
+train_ASTD_UN, test_ASTD_UN = train_test_split(
+    df_ASTD_UN, test_size=0.2, random_state=42
+)
+label_list_ASTD_UN = [0, 1, 2, 3]
+
+data_ASTD_UN = Dataset(
+    "ASTD-Unbalanced", train_ASTD_UN, test_ASTD_UN, label_list_ASTD_UN
+)
+all_datasets.append(data_ASTD_UN)
+#%%
+# *************ASTD-Dahou-Balanced************
+
+df_ASTD_B = pd.read_csv(
+    "Datasets\\Dahou\\data_csv_balanced\\ASTD-balanced-not-linked.csv",
+    sep=",",
+    header=0,
+)
+
+df_ASTD_B.columns = [DATA_COLUMN, LABEL_COLUMN]
+
+df_ASTD_B[LABEL_COLUMN] = df_ASTD_B[LABEL_COLUMN].apply(lambda x: 0 if (x == -1) else x)
+df_ASTD_B["text"] = df_ASTD_B["text"].progress_apply(
+    lambda x: arabert_prep.preprocess(
+        x
+    )
+)
+train_ASTD_B, test_ASTD_B = train_test_split(df_ASTD_B, test_size=0.2, random_state=42)
+label_list_ASTD_B = [0, 1]
+
+data_ASTD_B = Dataset(
+    "ASTD-Dahou-Balanced", train_ASTD_B, test_ASTD_B, label_list_ASTD_B
+)
+all_datasets.append(data_ASTD_B)
+
+#%%
+# *************ArSenTD-LEV************
+df_ArSenTD = pd.read_csv(
+    "Datasets\\ArSenTD-LEV\\ArSenTD-LEV-processed-no-emojis2.csv", sep=",", header=0
+)
+
+df_ArSenTD.columns = [DATA_COLUMN, LABEL_COLUMN]
+
+df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+    lambda x: 0 if (x == "very_negative") else x
+)
+df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+    lambda x: 1 if (x == "negative") else x
+)
+df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+    lambda x: 2 if (x == "neutral") else x
+)
+df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+    lambda x: 3 if (x == "positive") else x
+)
+df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
+    lambda x: 4 if (x == "very_positive") else x
+)
+df_ArSenTD["text"] = df_ArSenTD["text"].progress_apply(
+    lambda x: arabert_prep.preprocess(
+        x
+    )
+)
+label_list_ArSenTD = [0, 1, 2, 3, 4]
+
+train_ArSenTD, test_ArSenTD = train_test_split(
+    df_ArSenTD, test_size=0.2, random_state=42
+)
+
+data_ArSenTD = Dataset("ArSenTD-LEV", train_ArSenTD, test_ArSenTD, label_list_ArSenTD)
+all_datasets.append(data_ArSenTD)
+
+#%%
+# *************AJGT************
+df_AJGT = pd.read_excel("Datasets\\Ajgt\\AJGT.xlsx", header=0)
+
+df_AJGT = df_AJGT[["Feed", "Sentiment"]]
+df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]
+
+df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
+    lambda x: 0 if (x == "Negative") else x
+)
+df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
+    lambda x: 1 if (x == "Positive") else x
+)
+df_AJGT["text"] = df_AJGT["text"].progress_apply(
+    lambda x: arabert_prep.preprocess(
+        x
+    )
+)
+train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42)
+label_list_AJGT = [0, 1]
+
+data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
+all_datasets.append(data_AJGT)
+#%%
+# *************LABR-UN-Binary************
+from labr import LABR
+
+labr_helper = LABR()
+
+(d_train, y_train, d_test, y_test) = labr_helper.get_train_test(
+    klass="2", balanced="unbalanced"
+)
+
+train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
+test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})
+
+train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(
+    lambda x: arabert_prep.preprocess(
+        x
+    )
+)
+test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(
+    lambda x: arabert_prep.preprocess(
+        x
+    )
+)
+label_list_LABR_B_U = [0, 1]
+
+data_LABR_B_U = Dataset(
+    "LABR-UN-Binary", train_LABR_B_U, test_LABR_B_U, label_list_LABR_B_U
+)
+# all_datasets.append(data_LABR_B_U)
+
+#%%
+for data in tqdm(all_datasets):
+    # Use the InputExample class from BERT's run_classifier code to create examples from the data
+    data.train_InputExamples = data.train.apply(
+        lambda x: run_classifier.InputExample(
+            guid=None,  # Globally unique ID for bookkeeping, unused in this example
+            text_a=x[DATA_COLUMN],
+            text_b=None,
+            label=x[LABEL_COLUMN],
+        ),
+        axis=1,
+    )
+
+    data.test_InputExamples = data.test.apply(
+        lambda x: run_classifier.InputExample(
+            guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]
+        ),
+        axis=1,
+    )
+#%%
+# We'll set sequences to be at most 128 tokens long.
+MAX_SEQ_LENGTH = 256
+
+VOC_FNAME = "./64000_vocab_sp_70m.txt"
+tokenizer = FullTokenizer(VOC_FNAME)
+
+for data in tqdm(all_datasets):
+    # Convert our train and test features to InputFeatures that BERT understands.
+    data.train_features = run_classifier.convert_examples_to_features(
+        data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
+    )
+    data.test_features = run_classifier.convert_examples_to_features(
+        data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
+    )
+
+# %%
+import pickle
+
+with open("all_datasets_64k_farasa_256.pickle", "wb") as fp:  # Pickling
+    pickle.dump(all_datasets, fp)
+
+
+# %%
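The added script ends by serializing the `all_datasets` list to `all_datasets_64k_farasa_256.pickle`. As a minimal illustrative sketch (not part of the package), a later evaluation step could reload that file roughly as follows; note that pickle resolves the `Dataset` class by the module path recorded at dump time, so the class must be defined or importable in the loading context:

import pickle

# Assumption: the Dataset class above is available under the same module path
# that was active when the pickle was written (e.g. re-run its definition first).
with open("all_datasets_64k_farasa_256.pickle", "rb") as fp:
    all_datasets = pickle.load(fp)

for data in all_datasets:
    print(data.name, len(data.train), len(data.test))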