SinaTools-0.1.41-py2.py3-none-any.whl → SinaTools-1.0.1-py2.py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their respective public registries; it is provided for informational purposes only.
- {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.41.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.41.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.41.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
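The version bump itself is carried by `sinatools/VERSION`. After upgrading, the installed version can be confirmed through the standard metadata API; a minimal check, assuming the distribution name `SinaTools` as it appears in the dist-info paths above:

```python
from importlib.metadata import version  # standard library, Python 3.8+

# Read the installed distribution's version from its dist-info metadata.
print(version("SinaTools"))  # expected to print "1.0.1" after this upgrade
```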
```diff
--- a/sinatools/ner/trainers/BertTrainer.py
+++ b/sinatools/ner/trainers/BertTrainer.py
@@ -1,163 +1,163 @@
-import os
-import logging
-import torch
-import numpy as np
-from sinatools.ner.trainers import BaseTrainer
-from sinatools.ner.metrics import compute_single_label_metrics
-
-logger = logging.getLogger(__name__)
-
-
-class BertTrainer(BaseTrainer):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def train(self):
-        best_val_loss, test_loss = np.inf, np.inf
-        num_train_batch = len(self.train_dataloader)
-        patience = self.patience
-
-        for epoch_index in range(self.max_epochs):
-            self.current_epoch = epoch_index
-            train_loss = 0
-
-            for batch_index, (_, gold_tags, _, _, logits) in enumerate(self.tag(
-                self.train_dataloader, is_train=True
-            ), 1):
-                self.current_timestep += 1
-                batch_loss = self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
-                batch_loss.backward()
-
-                # Avoid exploding gradient by doing gradient clipping
-                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
-
-                self.optimizer.step()
-                self.scheduler.step()
-                train_loss += batch_loss.item()
-
-                if self.current_timestep % self.log_interval == 0:
-                    logger.info(
-                        "Epoch %d | Batch %d/%d | Timestep %d | LR %.10f | Loss %f",
-                        epoch_index,
-                        batch_index,
-                        num_train_batch,
-                        self.current_timestep,
-                        self.optimizer.param_groups[0]['lr'],
-                        batch_loss.item()
-                    )
-
-            train_loss /= num_train_batch
-
-            logger.info("** Evaluating on validation dataset **")
-            val_preds, segments, valid_len, val_loss = self.eval(self.val_dataloader)
-            val_metrics = compute_single_label_metrics(segments)
-
-            epoch_summary_loss = {
-                "train_loss": train_loss,
-                "val_loss": val_loss
-            }
-            epoch_summary_metrics = {
-                "val_micro_f1": val_metrics.micro_f1,
-                "val_precision": val_metrics.precision,
-                "val_recall": val_metrics.recall
-            }
-
-            logger.info(
-                "Epoch %d | Timestep %d | Train Loss %f | Val Loss %f | F1 %f",
-                epoch_index,
-                self.current_timestep,
-                train_loss,
-                val_loss,
-                val_metrics.micro_f1
-            )
-
-            if val_loss < best_val_loss:
-                patience = self.patience
-                best_val_loss = val_loss
-                logger.info("** Validation improved, evaluating test data **")
-                test_preds, segments, valid_len, test_loss = self.eval(self.test_dataloader)
-                self.segments_to_file(segments, os.path.join(self.output_path, "predictions.txt"))
-                test_metrics = compute_single_label_metrics(segments)
-
-                epoch_summary_loss["test_loss"] = test_loss
-                epoch_summary_metrics["test_micro_f1"] = test_metrics.micro_f1
-                epoch_summary_metrics["test_precision"] = test_metrics.precision
-                epoch_summary_metrics["test_recall"] = test_metrics.recall
-
-                logger.info(
-                    f"Epoch %d | Timestep %d | Test Loss %f | F1 %f",
-                    epoch_index,
-                    self.current_timestep,
-                    test_loss,
-                    test_metrics.micro_f1
-                )
-
-                self.save()
-            else:
-                patience -= 1
-
-                # No improvements, terminating early
-                if patience == 0:
-                    logger.info("Early termination triggered")
-                    break
-
-            self.summary_writer.add_scalars("Loss", epoch_summary_loss, global_step=self.current_timestep)
-            self.summary_writer.add_scalars("Metrics", epoch_summary_metrics, global_step=self.current_timestep)
-
-    def eval(self, dataloader):
-        golds, preds, segments, valid_lens = list(), list(), list(), list()
-        loss = 0
-
-        for _, gold_tags, tokens, valid_len, logits in self.tag(
-            dataloader, is_train=False
-        ):
-            loss += self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
-            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
-            segments += tokens
-            valid_lens += list(valid_len)
-
-        loss /= len(dataloader)
-
-        # Update segments, attach predicted tags to each token
-        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
-
-        return preds, segments, valid_lens, loss.item()
-
-    def infer(self, dataloader):
-        golds, preds, segments, valid_lens = list(), list(), list(), list()
-
-        for _, gold_tags, tokens, valid_len, logits in self.tag(
-            dataloader, is_train=False
-        ):
-            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
-            segments += tokens
-            valid_lens += list(valid_len)
-
-        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
-        return segments
-
-    def to_segments(self, segments, preds, valid_lens, vocab):
-        if vocab is None:
-            vocab = self.vocab
-
-        tagged_segments = list()
-        tokens_stoi = vocab.tokens.get_stoi()
-        tags_itos = vocab.tags[0].get_itos()
-        unk_id = tokens_stoi["UNK"]
-
-        for segment, pred, valid_len in zip(segments, preds, valid_lens):
-            # First, the token at 0th index [CLS] and token at nth index [SEP]
-            # Combine the tokens with their corresponding predictions
-            segment_pred = zip(segment[1:valid_len-1], pred[1:valid_len-1])
-
-            # Ignore the sub-tokens/subwords, which are identified with text being UNK
-            segment_pred = list(filter(lambda t: tokens_stoi[t[0].text] != unk_id, segment_pred))
-
-            # Attach the predicted tags to each token
-            list(map(lambda t: setattr(t[0], 'pred_tag', [{"tag": tags_itos[t[1]]}]), segment_pred))
-
-            # We are only interested in the tagged tokens, we do no longer need raw model predictions
-            tagged_segment = [t for t, _ in segment_pred]
-            tagged_segments.append(tagged_segment)
-
-        return tagged_segments
+import os
+import logging
+import torch
+import numpy as np
+from sinatools.ner.trainers import BaseTrainer
+from sinatools.ner.metrics import compute_single_label_metrics
+
+logger = logging.getLogger(__name__)
+
+
+class BertTrainer(BaseTrainer):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def train(self):
+        best_val_loss, test_loss = np.inf, np.inf
+        num_train_batch = len(self.train_dataloader)
+        patience = self.patience
+
+        for epoch_index in range(self.max_epochs):
+            self.current_epoch = epoch_index
+            train_loss = 0
+
+            for batch_index, (_, gold_tags, _, _, logits) in enumerate(self.tag(
+                self.train_dataloader, is_train=True
+            ), 1):
+                self.current_timestep += 1
+                batch_loss = self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
+                batch_loss.backward()
+
+                # Avoid exploding gradient by doing gradient clipping
+                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
+
+                self.optimizer.step()
+                self.scheduler.step()
+                train_loss += batch_loss.item()
+
+                if self.current_timestep % self.log_interval == 0:
+                    logger.info(
+                        "Epoch %d | Batch %d/%d | Timestep %d | LR %.10f | Loss %f",
+                        epoch_index,
+                        batch_index,
+                        num_train_batch,
+                        self.current_timestep,
+                        self.optimizer.param_groups[0]['lr'],
+                        batch_loss.item()
+                    )
+
+            train_loss /= num_train_batch
+
+            logger.info("** Evaluating on validation dataset **")
+            val_preds, segments, valid_len, val_loss = self.eval(self.val_dataloader)
+            val_metrics = compute_single_label_metrics(segments)
+
+            epoch_summary_loss = {
+                "train_loss": train_loss,
+                "val_loss": val_loss
+            }
+            epoch_summary_metrics = {
+                "val_micro_f1": val_metrics.micro_f1,
+                "val_precision": val_metrics.precision,
+                "val_recall": val_metrics.recall
+            }
+
+            logger.info(
+                "Epoch %d | Timestep %d | Train Loss %f | Val Loss %f | F1 %f",
+                epoch_index,
+                self.current_timestep,
+                train_loss,
+                val_loss,
+                val_metrics.micro_f1
+            )
+
+            if val_loss < best_val_loss:
+                patience = self.patience
+                best_val_loss = val_loss
+                logger.info("** Validation improved, evaluating test data **")
+                test_preds, segments, valid_len, test_loss = self.eval(self.test_dataloader)
+                self.segments_to_file(segments, os.path.join(self.output_path, "predictions.txt"))
+                test_metrics = compute_single_label_metrics(segments)
+
+                epoch_summary_loss["test_loss"] = test_loss
+                epoch_summary_metrics["test_micro_f1"] = test_metrics.micro_f1
+                epoch_summary_metrics["test_precision"] = test_metrics.precision
+                epoch_summary_metrics["test_recall"] = test_metrics.recall
+
+                logger.info(
+                    f"Epoch %d | Timestep %d | Test Loss %f | F1 %f",
+                    epoch_index,
+                    self.current_timestep,
+                    test_loss,
+                    test_metrics.micro_f1
+                )
+
+                self.save()
+            else:
+                patience -= 1
+
+                # No improvements, terminating early
+                if patience == 0:
+                    logger.info("Early termination triggered")
+                    break
+
+            self.summary_writer.add_scalars("Loss", epoch_summary_loss, global_step=self.current_timestep)
+            self.summary_writer.add_scalars("Metrics", epoch_summary_metrics, global_step=self.current_timestep)
+
+    def eval(self, dataloader):
+        golds, preds, segments, valid_lens = list(), list(), list(), list()
+        loss = 0
+
+        for _, gold_tags, tokens, valid_len, logits in self.tag(
+            dataloader, is_train=False
+        ):
+            loss += self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
+            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
+            segments += tokens
+            valid_lens += list(valid_len)
+
+        loss /= len(dataloader)
+
+        # Update segments, attach predicted tags to each token
+        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
+
+        return preds, segments, valid_lens, loss.item()
+
+    def infer(self, dataloader):
+        golds, preds, segments, valid_lens = list(), list(), list(), list()
+
+        for _, gold_tags, tokens, valid_len, logits in self.tag(
+            dataloader, is_train=False
+        ):
+            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
+            segments += tokens
+            valid_lens += list(valid_len)
+
+        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
+        return segments
+
+    def to_segments(self, segments, preds, valid_lens, vocab):
+        if vocab is None:
+            vocab = self.vocab
+
+        tagged_segments = list()
+        tokens_stoi = vocab.tokens.get_stoi()
+        tags_itos = vocab.tags[0].get_itos()
+        unk_id = tokens_stoi["UNK"]
+
+        for segment, pred, valid_len in zip(segments, preds, valid_lens):
+            # First, the token at 0th index [CLS] and token at nth index [SEP]
+            # Combine the tokens with their corresponding predictions
+            segment_pred = zip(segment[1:valid_len-1], pred[1:valid_len-1])
+
+            # Ignore the sub-tokens/subwords, which are identified with text being UNK
+            segment_pred = list(filter(lambda t: tokens_stoi[t[0].text] != unk_id, segment_pred))
+
+            # Attach the predicted tags to each token
+            list(map(lambda t: setattr(t[0], 'pred_tag', [{"tag": tags_itos[t[1]]}]), segment_pred))
+
+            # We are only interested in the tagged tokens, we do no longer need raw model predictions
+            tagged_segment = [t for t, _ in segment_pred]
+            tagged_segments.append(tagged_segment)
+
+        return tagged_segments
```
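For readers skimming the diff: the `train()` body above implements patience-based early stopping, where the counter resets whenever validation loss improves (and the model is saved via `self.save()`), and training halts once the counter reaches zero. A minimal sketch of just that control flow, with a hypothetical `run_epoch` callback standing in for the per-epoch training and validation work:

```python
import numpy as np

def train_with_patience(run_epoch, max_epochs, patience):
    """Patience loop as in BertTrainer.train(): reset on improvement, stop at zero."""
    best_val_loss, remaining = np.inf, patience
    for epoch in range(max_epochs):
        val_loss = run_epoch(epoch)  # hypothetical: trains one epoch, returns validation loss
        if val_loss < best_val_loss:
            best_val_loss, remaining = val_loss, patience  # improvement resets the budget
        else:
            remaining -= 1  # each non-improving epoch spends one unit of patience
            if remaining == 0:
                break  # early termination, as logged by the trainer
    return best_val_loss
```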
```diff
--- a/sinatools/ner/trainers/__init__.py
+++ b/sinatools/ner/trainers/__init__.py
@@ -1,3 +1,3 @@
-from sinatools.ner.trainers.BaseTrainer import BaseTrainer
-from sinatools.ner.trainers.BertTrainer import BertTrainer
+from sinatools.ner.trainers.BaseTrainer import BaseTrainer
+from sinatools.ner.trainers.BertTrainer import BertTrainer
 from sinatools.ner.trainers.BertNestedTrainer import BertNestedTrainer
```
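The two rewritten import lines are textually identical before and after (the churn is presumably whitespace or line endings), so the subpackage surface is unchanged and all three trainers remain importable directly from the package:

```python
# Re-exports from sinatools/ner/trainers/__init__.py, as shown in the hunk above.
from sinatools.ner.trainers import BaseTrainer, BertTrainer, BertNestedTrainer
```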
```diff
--- a/SinaTools-0.1.41.dist-info/RECORD
+++ /dev/null
@@ -1,123 +0,0 @@
-SinaTools-0.1.41.data/data/sinatools/environment.yml,sha256=i0UFZc-vwU9ZwnI8hBdz7vi-x22vG-HR8ojWBUAOkno,5422
-sinatools/VERSION,sha256=MoDOfoT2nFdWJzX_rU2mpf_gRhWTZlYybGVGv8AKRLk,6
-sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
-sinatools/environment.yml,sha256=i0UFZc-vwU9ZwnI8hBdz7vi-x22vG-HR8ojWBUAOkno,5422
-sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
-sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=EezvbukR3pZ8s6mGZnzTcjsbo3CBDlC0g6KhJWlYp1w,2686
-sinatools/CLI/morphology/ALMA_multi_word.py,sha256=rmpa72twwIJHme_kpQ1lu3_7y_Jorj70QTvOnQMJRuI,1274
-sinatools/CLI/morphology/morph_analyzer.py,sha256=HPamEKos_JRYCJv_2q6c12N--da58_JXTno9haww5Ao,3497
-sinatools/CLI/ner/corpus_entity_extractor.py,sha256=DdvigsDQzko5nJBjzUXlIDqoBMBTVzktjSo7JfEXTIA,4778
-sinatools/CLI/ner/entity_extractor.py,sha256=G9j-t0WKm2CRORhqARJM-pI-KArQ2IXIvnBK_NHxlHs,2885
-sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/CLI/utils/arStrip.py,sha256=NLyp8vOu2xv80tL9jiKRvyptmbkRZVg-wcAr-9YyvNY,3264
-sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
-sinatools/CLI/utils/implication.py,sha256=AojpkCwUQJiQjxhyEUWKRHmBnIt1tVqr485cAF7Thq0,2857
-sinatools/CLI/utils/jaccard.py,sha256=w56N_cNEFJ0A7WtunmY_xtms4srFagKBzrW_0YhH2DE,4216
-sinatools/CLI/utils/remove_latin.py,sha256=NOaTm2RHxt5IQrV98ySTmD8rTXTmcqSmfbPAwTyaXqU,848
-sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
-sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1vnRL0oYCSfqw,2823
-sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
-sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
-sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/DataDownload/downloader.py,sha256=VdUNgSqMKz1J-DuQD_eS1U2KWqEpy94WlSJ0pPODLig,7833
-sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
-sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
-sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
-sinatools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
-sinatools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
-sinatools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
-sinatools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
-sinatools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
-sinatools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
-sinatools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
-sinatools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
-sinatools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
-sinatools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
-sinatools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
-sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
-sinatools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
-sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
-sinatools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
-sinatools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
-sinatools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
-sinatools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
-sinatools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
-sinatools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-sinatools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
-sinatools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
-sinatools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
-sinatools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
-sinatools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
-sinatools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-sinatools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
-sinatools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
-sinatools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
-sinatools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-sinatools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
-sinatools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
-sinatools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-sinatools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
-sinatools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
-sinatools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
-sinatools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
-sinatools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
-sinatools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
-sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
-sinatools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
-sinatools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
-sinatools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
-sinatools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
-sinatools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
-sinatools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
-sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
-sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
-sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
-sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
-sinatools/morphology/morph_analyzer.py,sha256=JOH2UWKNQWo5UzpWNzP9R1D3B3qLSogIiMp8n0N_56o,7177
-sinatools/ner/__init__.py,sha256=59kLMX6UQhF6JpE10RhaDYC3a2_jiWOIVPuejsoflFE,1050
-sinatools/ner/data_format.py,sha256=VmFshZbEPOsWxsb4tgSkwvbM1k7yCce4kmtPkCiWgwM,4513
-sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
-sinatools/ner/entity_extractor.py,sha256=O2epRwRFUUcQs3SnFIYHVBI4zVhr8hRcj0XJYeby4ts,3588
-sinatools/ner/helpers.py,sha256=sX6ezVbuVQxk_xJqZwhUzJVFVuVmFGmei_kd6r3sPHE,3652
-sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
-sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
-sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
-sinatools/ner/data/datasets.py,sha256=_uUlvBAhnTtPwKLj0wIbmB04VCBidfwffxKorLGHq_g,5134
-sinatools/ner/data/transforms.py,sha256=URMz1dHzkHjgUGAkDOenCWvQThO1ha8XeQVjoLL9RXM,4874
-sinatools/ner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
-sinatools/ner/nn/BertNestedTagger.py,sha256=_fwAn1kiKmXe6m5y16Ipty3kvXIEFEmiUq74Ad1818U,1219
-sinatools/ner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
-sinatools/ner/nn/__init__.py,sha256=UgQD_XLNzQGBNSYc_Bw1aRJZjq4PJsnMT1iZwnJemqE,170
-sinatools/ner/trainers/BaseTrainer.py,sha256=Uar8HxtgBXCVhKa85sEN622d9P7JiFBcWfs46uRG4aA,4068
-sinatools/ner/trainers/BertNestedTrainer.py,sha256=iJOah69tXZsAXBimqP0odEsk8SPX4A355riePzW2BFs,8632
-sinatools/ner/trainers/BertTrainer.py,sha256=BtttsrHPolmK3eRDqrgVUuv6lVMuImIeskxhi02Q-44,6596
-sinatools/ner/trainers/__init__.py,sha256=Xnbi_M4KKJRqV7FJe1vklyT0nEW2Q2obxgcWkbR0ZbA,190
-sinatools/relations/__init__.py,sha256=cYjsP2mlTYvAwVIEFtgA6i9gLUSkGVOuDggMs7TvG5k,272
-sinatools/relations/relation_extractor.py,sha256=UuDlaaR0ch9BFv4sBF1tr7P-P9xq8oRZF41tAze6_ok,9751
-sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
-sinatools/semantic_relatedness/compute_relatedness.py,sha256=_9HFPs3nQBLklHFfkc9o3gEjEI6Bd34Ha4E1Kvv1RIg,2256
-sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
-sinatools/synonyms/synonyms_generator.py,sha256=jRd0D3_kn-jYBaZzqY-7oOy0SFjSJ-mjM7JhsySzX58,9037
-sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
-sinatools/utils/parser.py,sha256=qvHdln5R5CAv_0UOJWe0mcp8JCsGqgazoeIIkoALH88,6259
-sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
-sinatools/utils/similarity.py,sha256=HAK6OmyVnfjPm0GWL3z9s4ZoUwpZHVKxt3CeSMfqLIQ,11990
-sinatools/utils/text_dublication_detector.py,sha256=FeSkbfWGMQluz23H4CBHXION-walZPgjueX6AL8u_Q0,5660
-sinatools/utils/text_transliteration.py,sha256=F3smhr2AEJtySE6wGQsiXXOslTvSDzLivTYu0btgc10,8769
-sinatools/utils/tokenizer.py,sha256=nyk6lh5-p38wrU62hvh4wg7ni9ammkdqqIgcjbbBxxo,6965
-sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
-sinatools/utils/word_compare.py,sha256=rS2Z74sf7R-7MTXyrFj5miRi2TnSG9OdTDp_qQYuo2Y,28200
-sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
-sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
-sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
-sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
-SinaTools-0.1.41.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
-SinaTools-0.1.41.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
-SinaTools-0.1.41.dist-info/METADATA,sha256=9zBmOUN3RovUR57RCZeJNWwTxjSqEmLSjka3SL04KZA,3410
-SinaTools-0.1.41.dist-info/WHEEL,sha256=9Hm2OB-j1QcCUq9Jguht7ayGIIZBRTdOXD1qg9cCgPM,109
-SinaTools-0.1.41.dist-info/entry_points.txt,sha256=_CsRKM_tSCWV5hefBNUsWf9_6DrJnzFlxeAo1wm5XqY,1302
-SinaTools-0.1.41.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
-SinaTools-0.1.41.dist-info/RECORD,,
```
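Each RECORD row has the form `path,sha256=<digest>,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with the `=` padding stripped (per PEP 376/PEP 427). A small standard-library sketch for checking a row against a file on disk; the example path is illustrative:

```python
import base64
import hashlib

def wheel_record_digest(path):
    # Hash the file and encode it the way RECORD does:
    # urlsafe base64 with trailing '=' padding removed.
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. wheel_record_digest("sinatools/VERSION") should return
# "MoDOfoT2nFdWJzX_rU2mpf_gRhWTZlYybGVGv8AKRLk" for the 0.1.41 wheel above
```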
```diff
--- a/sinatools/arabert/arabert/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
```