SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
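The summary above also records the version bump itself (`sinatools/VERSION +1 -1`). A quick, optional post-upgrade check using only the standard library (`SinaTools` is the distribution name taken from the wheel metadata above):

    # Confirm which SinaTools release is installed after upgrading.
    import importlib.metadata
    print(importlib.metadata.version("SinaTools"))  # expected: "1.0.1"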
sinatools/arabert/araelectra/run_finetuning.py (deleted, 323 lines)
@@ -1,323 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Fine-tunes an ELECTRA model on a downstream task."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import collections
-import json
-
-import tensorflow as tf
-
-import configure_finetuning
-from finetune import preprocessing
-from finetune import task_builder
-from model import modeling
-from model import optimization
-from util import training_utils
-from util import utils
-
-
-class FinetuningModel(object):
-  """Finetuning model with support for multi-task training."""
-
-  def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
-               is_training, features, num_train_steps):
-    # Create a shared transformer encoder
-    bert_config = training_utils.get_bert_config(config)
-    self.bert_config = bert_config
-    if config.debug:
-      bert_config.num_hidden_layers = 3
-      bert_config.hidden_size = 144
-      bert_config.intermediate_size = 144 * 4
-      bert_config.num_attention_heads = 4
-    assert config.max_seq_length <= bert_config.max_position_embeddings
-    bert_model = modeling.BertModel(
-        bert_config=bert_config,
-        is_training=is_training,
-        input_ids=features["input_ids"],
-        input_mask=features["input_mask"],
-        token_type_ids=features["segment_ids"],
-        use_one_hot_embeddings=config.use_tpu,
-        embedding_size=config.embedding_size)
-    percent_done = (tf.cast(tf.train.get_or_create_global_step(), tf.float32) /
-                    tf.cast(num_train_steps, tf.float32))
-
-    # Add specific tasks
-    self.outputs = {"task_id": features["task_id"]}
-    losses = []
-    for task in tasks:
-      with tf.variable_scope("task_specific/" + task.name):
-        task_losses, task_outputs = task.get_prediction_module(
-            bert_model, features, is_training, percent_done)
-        losses.append(task_losses)
-        self.outputs[task.name] = task_outputs
-    self.loss = tf.reduce_sum(
-        tf.stack(losses, -1) *
-        tf.one_hot(features["task_id"], len(config.task_names)))
-
-
-def model_fn_builder(config: configure_finetuning.FinetuningConfig, tasks,
-                     num_train_steps, pretraining_config=None):
-  """Returns `model_fn` closure for TPUEstimator."""
-
-  def model_fn(features, labels, mode, params):
-    """The `model_fn` for TPUEstimator."""
-    utils.log("Building model...")
-    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
-    model = FinetuningModel(
-        config, tasks, is_training, features, num_train_steps)
-
-    # Load pre-trained weights from checkpoint
-    init_checkpoint = config.init_checkpoint
-    if pretraining_config is not None:
-      init_checkpoint = tf.train.latest_checkpoint(pretraining_config.model_dir)
-      utils.log("Using checkpoint", init_checkpoint)
-    tvars = tf.trainable_variables()
-    scaffold_fn = None
-    if init_checkpoint:
-      assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
-          tvars, init_checkpoint)
-      if config.use_tpu:
-        def tpu_scaffold():
-          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-          return tf.train.Scaffold()
-        scaffold_fn = tpu_scaffold
-      else:
-        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
-    # Build model for training or prediction
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      train_op = optimization.create_optimizer(
-          model.loss, config.learning_rate, num_train_steps,
-          weight_decay_rate=config.weight_decay_rate,
-          use_tpu=config.use_tpu,
-          warmup_proportion=config.warmup_proportion,
-          layerwise_lr_decay_power=config.layerwise_lr_decay,
-          n_transformer_layers=model.bert_config.num_hidden_layers
-      )
-      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
-          mode=mode,
-          loss=model.loss,
-          train_op=train_op,
-          scaffold_fn=scaffold_fn,
-          training_hooks=[training_utils.ETAHook(
-              {} if config.use_tpu else dict(loss=model.loss),
-              num_train_steps, config.iterations_per_loop, config.use_tpu, 10)])
-    else:
-      assert mode == tf.estimator.ModeKeys.PREDICT
-      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
-          mode=mode,
-          predictions=utils.flatten_dict(model.outputs),
-          scaffold_fn=scaffold_fn)
-
-    utils.log("Building complete")
-    return output_spec
-
-  return model_fn
-
-
-class ModelRunner(object):
-  """Fine-tunes a model on a supervised task."""
-
-  def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
-               pretraining_config=None):
-    self._config = config
-    self._tasks = tasks
-    self._preprocessor = preprocessing.Preprocessor(config, self._tasks)
-
-    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
-    tpu_cluster_resolver = None
-    if config.use_tpu and config.tpu_name:
-      tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
-          config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
-    tpu_config = tf.estimator.tpu.TPUConfig(
-        iterations_per_loop=config.iterations_per_loop,
-        num_shards=config.num_tpu_cores,
-        per_host_input_for_training=is_per_host,
-        tpu_job_name=config.tpu_job_name)
-    run_config = tf.estimator.tpu.RunConfig(
-        cluster=tpu_cluster_resolver,
-        model_dir=config.model_dir,
-        save_checkpoints_steps=config.save_checkpoints_steps,
-        save_checkpoints_secs=None,
-        tpu_config=tpu_config)
-
-    if self._config.do_train:
-      (self._train_input_fn,
-       self.train_steps) = self._preprocessor.prepare_train()
-    else:
-      self._train_input_fn, self.train_steps = None, 0
-    model_fn = model_fn_builder(
-        config=config,
-        tasks=self._tasks,
-        num_train_steps=self.train_steps,
-        pretraining_config=pretraining_config)
-    self._estimator = tf.estimator.tpu.TPUEstimator(
-        use_tpu=config.use_tpu,
-        model_fn=model_fn,
-        config=run_config,
-        train_batch_size=config.train_batch_size,
-        eval_batch_size=config.eval_batch_size,
-        predict_batch_size=config.predict_batch_size)
-
-  def train(self):
-    utils.log("Training for {:} steps".format(self.train_steps))
-    self._estimator.train(
-        input_fn=self._train_input_fn, max_steps=self.train_steps)
-
-  def evaluate(self):
-    return {task.name: self.evaluate_task(task) for task in self._tasks}
-
-  def evaluate_task(self, task, split="dev", return_results=True):
-    """Evaluate the current model."""
-    utils.log("Evaluating", task.name)
-    eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
-    results = self._estimator.predict(input_fn=eval_input_fn,
-                                      yield_single_examples=True)
-    scorer = task.get_scorer()
-    for r in results:
-      if r["task_id"] != len(self._tasks):  # ignore padding examples
-        r = utils.nest_dict(r, self._config.task_names)
-        scorer.update(r[task.name])
-    if return_results:
-      utils.log(task.name + ": " + scorer.results_str())
-      utils.log()
-      return dict(scorer.get_results())
-    else:
-      return scorer
-
-  def write_classification_outputs(self, tasks, trial, split):
-    """Write classification predictions to disk."""
-    utils.log("Writing out predictions for", tasks, split)
-    predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
-    results = self._estimator.predict(input_fn=predict_input_fn,
-                                      yield_single_examples=True)
-    # task name -> eid -> model-logits
-    logits = collections.defaultdict(dict)
-    for r in results:
-      if r["task_id"] != len(self._tasks):
-        r = utils.nest_dict(r, self._config.task_names)
-        task_name = self._config.task_names[r["task_id"]]
-        logits[task_name][r[task_name]["eid"]] = (
-            r[task_name]["logits"] if "logits" in r[task_name]
-            else r[task_name]["predictions"])
-    for task_name in logits:
-      utils.log("Pickling predictions for {:} {:} examples ({:})".format(
-          len(logits[task_name]), task_name, split))
-      if trial <= self._config.n_writes_test:
-        utils.write_pickle(logits[task_name], self._config.test_predictions(
-            task_name, split, trial))
-
-
-def write_results(config: configure_finetuning.FinetuningConfig, results):
-  """Write evaluation metrics to disk."""
-  utils.log("Writing results to", config.results_txt)
-  utils.mkdir(config.results_txt.rsplit("/", 1)[0])
-  utils.write_pickle(results, config.results_pkl)
-  with tf.io.gfile.GFile(config.results_txt, "w") as f:
-    results_str = ""
-    for trial_results in results:
-      for task_name, task_results in trial_results.items():
-        if task_name == "time" or task_name == "global_step":
-          continue
-        results_str += task_name + ": " + " - ".join(
-            ["{:}: {:.2f}".format(k, v)
-             for k, v in task_results.items()]) + "\n"
-    f.write(results_str)
-  utils.write_pickle(results, config.results_pkl)
-
-
-def run_finetuning(config: configure_finetuning.FinetuningConfig):
-  """Run finetuning."""
-
-  # Setup for training
-  results = []
-  trial = 1
-  heading_info = "model={:}, trial {:}/{:}".format(
-      config.model_name, trial, config.num_trials)
-  heading = lambda msg: utils.heading(msg + ": " + heading_info)
-  heading("Config")
-  utils.log_config(config)
-  generic_model_dir = config.model_dir
-  tasks = task_builder.get_tasks(config)
-
-  # Train and evaluate num_trials models with different random seeds
-  while config.num_trials < 0 or trial <= config.num_trials:
-    config.model_dir = generic_model_dir + "_" + str(trial)
-    if config.do_train:
-      utils.rmkdir(config.model_dir)
-
-    model_runner = ModelRunner(config, tasks)
-    if config.do_train:
-      heading("Start training")
-      model_runner.train()
-      utils.log()
-
-    if config.do_eval:
-      heading("Run dev set evaluation")
-      results.append(model_runner.evaluate())
-      write_results(config, results)
-      if config.write_test_outputs and trial <= config.n_writes_test:
-        heading("Running on the test set and writing the predictions")
-        for task in tasks:
-          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
-          if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
-                           "sts"]:
-            for split in task.get_test_splits():
-              model_runner.write_classification_outputs([task], trial, split)
-          elif task.name == "squad":
-            scorer = model_runner.evaluate_task(task, "test", False)
-            scorer.write_predictions()
-            preds = utils.load_json(config.qa_preds_file("squad"))
-            null_odds = utils.load_json(config.qa_na_file("squad"))
-            for q, _ in preds.items():
-              if null_odds[q] > config.qa_na_threshold:
-                preds[q] = ""
-            utils.write_json(preds, config.test_predictions(
-                task.name, "test", trial))
-          else:
-            utils.log("Skipping task", task.name,
-                      "- writing predictions is not supported for this task")
-
-    if trial != config.num_trials and (not config.keep_all_models):
-      utils.rmrf(config.model_dir)
-    trial += 1
-
-
-def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument("--data-dir", required=True,
-                      help="Location of data files (model weights, etc).")
-  parser.add_argument("--model-name", required=True,
-                      help="The name of the model being fine-tuned.")
-  parser.add_argument("--hparams", default="{}",
-                      help="JSON dict of model hyperparameters.")
-  args = parser.parse_args()
-  if args.hparams.endswith(".json"):
-    hparams = utils.load_json(args.hparams)
-  else:
-    hparams = json.loads(args.hparams)
-  tf.logging.set_verbosity(tf.logging.ERROR)
-  run_finetuning(configure_finetuning.FinetuningConfig(
-      args.model_name, args.data_dir, **hparams))
-
-
-if __name__ == "__main__":
-  main()
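For orientation: the removed script above is driven entirely through its `main()` argparse entry point, which builds a `configure_finetuning.FinetuningConfig` and hands it to `run_finetuning()`. A minimal sketch of that call path, assuming the araelectra modules are still importable; the model name, data path, and hparams below are illustrative assumptions, not values shipped with SinaTools:

    # Sketch only: mirrors main() above. "electra_base", the data path, and the
    # hparams dict are placeholders, not part of this package.
    import configure_finetuning
    from run_finetuning import run_finetuning

    hparams = {"task_names": ["mnli"], "do_train": True, "do_eval": True}
    run_finetuning(configure_finetuning.FinetuningConfig(
        "electra_base", "/path/to/finetuning_data", **hparams))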