SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
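Nearly all of the deletions above come from dropping the vendored sinatools/arabert tree (the upstream AraBERT, AraELECTRA, and AraGPT2 training, pretraining, and tokenization scripts, files 9-58), while the two NER trainers (files 5-6) show equal added and removed line counts, which is consistent with a wholesale reformat rather than a behavioral change. As a minimal sketch (not part of this diff), one can confirm that a locally installed wheel no longer ships the vendored tree; only the distribution name SinaTools is taken from this diff, the rest is the Python 3.8+ standard library:

    # Sketch: check an installed SinaTools distribution for the vendored
    # arabert/araelectra/aragpt2 modules removed in 1.0.1.
    from importlib.metadata import files, version

    print(version("SinaTools"))  # expected: 1.0.1

    vendored = [
        str(p)
        for p in (files("SinaTools") or [])
        if any(part in str(p) for part in ("arabert/arabert", "araelectra", "aragpt2"))
    ]
    print(vendored or "no vendored arabert/araelectra/aragpt2 files found")

An empty result against 1.0.1 (and a long list against 0.1.40) matches the file list above.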
sinatools/arabert/arabert/create_classification_data.py (file 10, deleted)
@@ -1,260 +0,0 @@
-# Scripts used to pre_process and create the data for classifier evaluation
-#%%
-import pandas as pd
-from sklearn.model_selection import train_test_split
-
-import sys
-sys.path.append("..")
-
-from arabert.preprocess import ArabertPreprocessor
-
-
-from tqdm import tqdm
-
-tqdm.pandas()
-
-from tokenization import FullTokenizer
-from run_classifier import input_fn_builder, model_fn_builder
-
-
-model_name = "bert-base-arabert"
-arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)
-
-
-class Dataset:
-    def __init__(
-        self,
-        name,
-        train,
-        test,
-        label_list,
-        train_InputExamples=None,
-        test_InputExamples=None,
-        train_features=None,
-        test_features=None,
-    ):
-        self.name = name
-        self.train = train
-        self.test = test
-        self.label_list = label_list
-        self.train_InputExamples = train_InputExamples
-        self.test_InputExamples = test_InputExamples
-        self.train_features = train_features
-        self.test_features = test_features
-
-
-all_datasets = []
-#%%
-# *************HARD************
-df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)
-
-df_HARD = df_HARD[["rating", "review"]]  # we are interested in rating and review only
-# code rating as +ve if > 3, -ve if less, no 3s in dataset
-df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
-# rename columns to fit default constructor in fastai
-df_HARD.columns = ["label", "text"]
-df_HARD["text"] = df_HARD["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
-label_list_HARD = [0, 1]
-
-data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
-all_datasets.append(data_Hard)
-
-#%%
-# *************ASTD-Unbalanced************
-df_ASTD_UN = pd.read_csv(
-    "Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
-)
-
-DATA_COLUMN = "text"
-LABEL_COLUMN = "label"
-df_ASTD_UN.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 0 if (x == "NEG") else x
-)
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 1 if (x == "POS") else x
-)
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 2 if (x == "NEUTRAL") else x
-)
-df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
-    lambda x: 3 if (x == "OBJ") else x
-)
-df_ASTD_UN["text"] = df_ASTD_UN["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_ASTD_UN, test_ASTD_UN = train_test_split(
-    df_ASTD_UN, test_size=0.2, random_state=42
-)
-label_list_ASTD_UN = [0, 1, 2, 3]
-
-data_ASTD_UN = Dataset(
-    "ASTD-Unbalanced", train_ASTD_UN, test_ASTD_UN, label_list_ASTD_UN
-)
-all_datasets.append(data_ASTD_UN)
-#%%
-# *************ASTD-Dahou-Balanced************
-
-df_ASTD_B = pd.read_csv(
-    "Datasets\\Dahou\\data_csv_balanced\\ASTD-balanced-not-linked.csv",
-    sep=",",
-    header=0,
-)
-
-df_ASTD_B.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_ASTD_B[LABEL_COLUMN] = df_ASTD_B[LABEL_COLUMN].apply(lambda x: 0 if (x == -1) else x)
-df_ASTD_B["text"] = df_ASTD_B["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_ASTD_B, test_ASTD_B = train_test_split(df_ASTD_B, test_size=0.2, random_state=42)
-label_list_ASTD_B = [0, 1]
-
-data_ASTD_B = Dataset(
-    "ASTD-Dahou-Balanced", train_ASTD_B, test_ASTD_B, label_list_ASTD_B
-)
-all_datasets.append(data_ASTD_B)
-
-#%%
-# *************ArSenTD-LEV************
-df_ArSenTD = pd.read_csv(
-    "Datasets\\ArSenTD-LEV\\ArSenTD-LEV-processed-no-emojis2.csv", sep=",", header=0
-)
-
-df_ArSenTD.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 0 if (x == "very_negative") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 1 if (x == "negative") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 2 if (x == "neutral") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 3 if (x == "positive") else x
-)
-df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
-    lambda x: 4 if (x == "very_positive") else x
-)
-df_ArSenTD["text"] = df_ArSenTD["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-label_list_ArSenTD = [0, 1, 2, 3, 4]
-
-train_ArSenTD, test_ArSenTD = train_test_split(
-    df_ArSenTD, test_size=0.2, random_state=42
-)
-
-data_ArSenTD = Dataset("ArSenTD-LEV", train_ArSenTD, test_ArSenTD, label_list_ArSenTD)
-all_datasets.append(data_ArSenTD)
-
-#%%
-# *************AJGT************
-df_AJGT = pd.read_excel("Datasets\\Ajgt\\AJGT.xlsx", header=0)
-
-df_AJGT = df_AJGT[["Feed", "Sentiment"]]
-df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]
-
-df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
-    lambda x: 0 if (x == "Negative") else x
-)
-df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
-    lambda x: 1 if (x == "Positive") else x
-)
-df_AJGT["text"] = df_AJGT["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42)
-label_list_AJGT = [0, 1]
-
-data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
-all_datasets.append(data_AJGT)
-#%%
-# *************LABR-UN-Binary************
-from labr import LABR
-
-labr_helper = LABR()
-
-(d_train, y_train, d_test, y_test) = labr_helper.get_train_test(
-    klass="2", balanced="unbalanced"
-)
-
-train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
-test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})
-
-train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(
-    lambda x: arabert_prep.preprocess(
-        x
-    )
-)
-label_list_LABR_B_U = [0, 1]
-
-data_LABR_B_U = Dataset(
-    "LABR-UN-Binary", train_LABR_B_U, test_LABR_B_U, label_list_LABR_B_U
-)
-# all_datasets.append(data_LABR_B_U)
-
-#%%
-for data in tqdm(all_datasets):
-    # Use the InputExample class from BERT's run_classifier code to create examples from the data
-    data.train_InputExamples = data.train.apply(
-        lambda x: run_classifier.InputExample(
-            guid=None,  # Globally unique ID for bookkeeping, unused in this example
-            text_a=x[DATA_COLUMN],
-            text_b=None,
-            label=x[LABEL_COLUMN],
-        ),
-        axis=1,
-    )
-
-    data.test_InputExamples = data.test.apply(
-        lambda x: run_classifier.InputExample(
-            guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]
-        ),
-        axis=1,
-    )
-#%%
-# We'll set sequences to be at most 128 tokens long.
-MAX_SEQ_LENGTH = 256
-
-VOC_FNAME = "./64000_vocab_sp_70m.txt"
-tokenizer = FullTokenizer(VOC_FNAME)
-
-for data in tqdm(all_datasets):
-    # Convert our train and test features to InputFeatures that BERT understands.
-    data.train_features = run_classifier.convert_examples_to_features(
-        data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
-    )
-    data.test_features = run_classifier.convert_examples_to_features(
-        data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
-    )
-
-# %%
-import pickle
-
-with open("all_datasets_64k_farasa_256.pickle", "wb") as fp:  # Pickling
-    pickle.dump(all_datasets, fp)
-
-
-# %%
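The deleted create_classification_data.py is a self-contained evaluation script rather than an importable API: for each corpus it normalizes the text with ArabertPreprocessor, maps string labels to integers, and splits 80/20 with a fixed seed. A minimal sketch of that per-dataset pattern against the standalone arabert package on PyPI (the reviews.tsv path and its rating/review columns are illustrative placeholders mirroring the HARD block above; the preprocessor name and keep_emojis kwarg are taken verbatim from the removed script):

    # Sketch of the removed per-dataset pipeline, using the standalone
    # arabert package instead of the deleted vendored copy.
    # "reviews.tsv" and its rating/review columns are hypothetical inputs.
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from arabert.preprocess import ArabertPreprocessor

    prep = ArabertPreprocessor(model_name="bert-base-arabert", keep_emojis=False)

    df = pd.read_csv("reviews.tsv", sep="\t", header=0)[["rating", "review"]]
    df["rating"] = df["rating"].apply(lambda x: 0 if x < 3 else 1)  # binarize: <3 is negative
    df.columns = ["label", "text"]
    df["text"] = df["text"].apply(prep.preprocess)  # AraBERT text normalization

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

The fixed random_state=42 reproduces the same split across runs, matching the removed script's behavior.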