SinaTools-0.1.41-py2.py3-none-any.whl → SinaTools-1.0.1-py2.py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
Files changed (63)
  1. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  5. sinatools/ner/trainers/BertTrainer.py +163 -163
  6. sinatools/ner/trainers/__init__.py +2 -2
  7. SinaTools-0.1.41.dist-info/RECORD +0 -123
  8. sinatools/arabert/arabert/__init__.py +0 -14
  9. sinatools/arabert/arabert/create_classification_data.py +0 -260
  10. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  11. sinatools/arabert/arabert/extract_features.py +0 -444
  12. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  13. sinatools/arabert/arabert/modeling.py +0 -1027
  14. sinatools/arabert/arabert/optimization.py +0 -202
  15. sinatools/arabert/arabert/run_classifier.py +0 -1078
  16. sinatools/arabert/arabert/run_pretraining.py +0 -593
  17. sinatools/arabert/arabert/run_squad.py +0 -1440
  18. sinatools/arabert/arabert/tokenization.py +0 -414
  19. sinatools/arabert/araelectra/__init__.py +0 -1
  20. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  21. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  22. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  23. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  24. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  25. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  26. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  27. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  28. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  29. sinatools/arabert/araelectra/finetune/task.py +0 -74
  30. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  31. sinatools/arabert/araelectra/flops_computation.py +0 -215
  32. sinatools/arabert/araelectra/model/__init__.py +0 -14
  33. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  34. sinatools/arabert/araelectra/model/optimization.py +0 -193
  35. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  36. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  37. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  38. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  39. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  40. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  41. sinatools/arabert/araelectra/util/__init__.py +0 -14
  42. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  43. sinatools/arabert/araelectra/util/utils.py +0 -109
  44. sinatools/arabert/aragpt2/__init__.py +0 -2
  45. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  46. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  47. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  48. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  49. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  50. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  51. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  52. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  53. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  54. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  55. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  56. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  57. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  58. {SinaTools-0.1.41.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  59. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  60. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  61. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  62. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  63. {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
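
Most of the removed line count in this release comes from dropping the bundled arabert, araelectra, and aragpt2 training and pretraining scripts (items 8–57 above). Below is a minimal sketch of how downstream code might cope with the removal, assuming it previously imported one of the deleted modules; the fallback behaviour and the use of importlib.metadata are illustrative and not part of SinaTools itself:

    import importlib.metadata

    # Which SinaTools is installed? 1.0.1 no longer ships the arabert training scripts.
    installed = importlib.metadata.version("SinaTools")

    try:
        # Module removed in 1.0.1 (see item 10 in the file list above).
        from sinatools.arabert.arabert import create_pretraining_data
    except ImportError:
        create_pretraining_data = None  # illustrative fallback: disable the dependent feature

    if create_pretraining_data is None:
        print(f"SinaTools {installed}: bundled arabert pretraining scripts are not available")

The same guard applies to any of the other removed module paths listed above.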
sinatools/arabert/arabert/create_classification_data.py
@@ -1,260 +0,0 @@
-# Scripts used to pre_process and create the data for classifier evaluation
-#%%
-import pandas as pd
-from sklearn.model_selection import train_test_split
-
-import sys
-sys.path.append("..")
-
-from arabert.preprocess import ArabertPreprocessor
-
-
-from tqdm import tqdm
-
-tqdm.pandas()
-
-from tokenization import FullTokenizer
-from run_classifier import input_fn_builder, model_fn_builder
-
-
-model_name = "bert-base-arabert"
-arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)
-
-
-class Dataset:
-    def __init__(
-        self,
-        name,
-        train,
-        test,
-        label_list,
-        train_InputExamples=None,
-        test_InputExamples=None,
-        train_features=None,
-        test_features=None,
-    ):
-        self.name = name
-        self.train = train
-        self.test = test
-        self.label_list = label_list
-        self.train_InputExamples = train_InputExamples
-        self.test_InputExamples = test_InputExamples
-        self.train_features = train_features
-        self.test_features = test_features
-
-
-all_datasets = []
-#%%
-# *************HARD************
-df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)
-
-df_HARD = df_HARD[["rating", "review"]] # we are interested in rating and review only
-# code rating as +ve if > 3, -ve if less, no 3s in dataset
-df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
-# rename columns to fit default constructor in fastai
-df_HARD.columns = ["label", "text"]
-df_HARD["text"] = df_HARD["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
-label_list_HARD = [0, 1]
-
-data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
-all_datasets.append(data_Hard)
-
-#%%
-# *************ASTD-Unbalanced************
-df_ASTD_UN = pd.read_csv(
-    "Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
-)
-
-DATA_COLUMN = "text"
-LABEL_COLUMN = "label"
-df_ASTD_UN.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 0 if (x == "NEG") else x
-)
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 1 if (x == "POS") else x
-)
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 2 if (x == "NEUTRAL") else x
-)
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 3 if (x == "OBJ") else x
-)
-df_ASTD_UN["text"] = df_ASTD_UN["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_ASTD_UN, test_ASTD_UN = train_test_split(
-    df_ASTD_UN, test_size=0.2, random_state=42
-)
-label_list_ASTD_UN = [0, 1, 2, 3]
-
-data_ASTD_UN = Dataset(
-    "ASTD-Unbalanced", train_ASTD_UN, test_ASTD_UN, label_list_ASTD_UN
-)
-all_datasets.append(data_ASTD_UN)
-#%%
-# *************ASTD-Dahou-Balanced************
-
-df_ASTD_B = pd.read_csv(
-    "Datasets\\Dahou\\data_csv_balanced\\ASTD-balanced-not-linked.csv",
-    sep=",",
-    header=0,
-)
-
-df_ASTD_B.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_ASTD_B[LABEL_COLUMN] = df_ASTD_B[LABEL_COLUMN].apply(lambda x: 0 if (x == -1) else x)
-df_ASTD_B["text"] = df_ASTD_B["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_ASTD_B, test_ASTD_B = train_test_split(df_ASTD_B, test_size=0.2, random_state=42)
-label_list_ASTD_B = [0, 1]
-
-data_ASTD_B = Dataset(
-    "ASTD-Dahou-Balanced", train_ASTD_B, test_ASTD_B, label_list_ASTD_B
-)
-all_datasets.append(data_ASTD_B)
-
-#%%
-# *************ArSenTD-LEV************
-df_ArSenTD = pd.read_csv(
-    "Datasets\\ArSenTD-LEV\\ArSenTD-LEV-processed-no-emojis2.csv", sep=",", header=0
-)
-
-df_ArSenTD.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 0 if (x == "very_negative") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 1 if (x == "negative") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 2 if (x == "neutral") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 3 if (x == "positive") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 4 if (x == "very_positive") else x
-)
-df_ArSenTD["text"] = df_ArSenTD["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-label_list_ArSenTD = [0, 1, 2, 3, 4]
-
-train_ArSenTD, test_ArSenTD = train_test_split(
-    df_ArSenTD, test_size=0.2, random_state=42
-)
-
-data_ArSenTD = Dataset("ArSenTD-LEV", train_ArSenTD, test_ArSenTD, label_list_ArSenTD)
-all_datasets.append(data_ArSenTD)
-
-#%%
-# *************AJGT************
-df_AJGT = pd.read_excel("Datasets\\Ajgt\\AJGT.xlsx", header=0)
-
-df_AJGT = df_AJGT[["Feed", "Sentiment"]]
-df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
-    lambda x: 0 if (x == "Negative") else x
-)
-df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
-    lambda x: 1 if (x == "Positive") else x
-)
-df_AJGT["text"] = df_AJGT["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42)
-label_list_AJGT = [0, 1]
-
-data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
-all_datasets.append(data_AJGT)
-#%%
-# *************LABR-UN-Binary************
-from labr import LABR
-
-labr_helper = LABR()
-
-(d_train, y_train, d_test, y_test) = labr_helper.get_train_test(
-    klass="2", balanced="unbalanced"
-)
-
-train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
-test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})
-
-train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-label_list_LABR_B_U = [0, 1]
-
-data_LABR_B_U = Dataset(
-    "LABR-UN-Binary", train_LABR_B_U, test_LABR_B_U, label_list_LABR_B_U
-)
-# all_datasets.append(data_LABR_B_U)
-
-#%%
-for data in tqdm(all_datasets):
-    # Use the InputExample class from BERT's run_classifier code to create examples from the data
-    data.train_InputExamples = data.train.apply(
-        lambda x: run_classifier.InputExample(
-            guid=None, # Globally unique ID for bookkeeping, unused in this example
-            text_a=x[DATA_COLUMN],
-            text_b=None,
-            label=x[LABEL_COLUMN],
-        ),
-        axis=1,
-    )
-
-    data.test_InputExamples = data.test.apply(
-        lambda x: run_classifier.InputExample(
-            guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]
-        ),
-        axis=1,
-    )
-#%%
-# We'll set sequences to be at most 128 tokens long.
-MAX_SEQ_LENGTH = 256
-
-VOC_FNAME = "./64000_vocab_sp_70m.txt"
-tokenizer = FullTokenizer(VOC_FNAME)
-
-for data in tqdm(all_datasets):
-    # Convert our train and test features to InputFeatures that BERT understands.
-    data.train_features = run_classifier.convert_examples_to_features(
-        data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
-    )
-    data.test_features = run_classifier.convert_examples_to_features(
-        data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
-    )
-
-# %%
-import pickle
-
-with open("all_datasets_64k_farasa_256.pickle", "wb") as fp: # Pickling
-    pickle.dump(all_datasets, fp)
-
-
-# %%