SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0

sinatools/arabert/araelectra/finetune/feature_spec.py
@@ -1,56 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines the inputs used when fine-tuning a model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-import configure_finetuning
-
-
-def get_shared_feature_specs(config: configure_finetuning.FinetuningConfig):
-  """Non-task-specific model inputs."""
-  return [
-      FeatureSpec("input_ids", [config.max_seq_length]),
-      FeatureSpec("input_mask", [config.max_seq_length]),
-      FeatureSpec("segment_ids", [config.max_seq_length]),
-      FeatureSpec("task_id", []),
-  ]
-
-
-class FeatureSpec(object):
-  """Defines a feature passed as input to the model."""
-
-  def __init__(self, name, shape, default_value_fn=None, is_int_feature=True):
-    self.name = name
-    self.shape = shape
-    self.default_value_fn = default_value_fn
-    self.is_int_feature = is_int_feature
-
-  def get_parsing_spec(self):
-    return tf.io.FixedLenFeature(
-        self.shape, tf.int64 if self.is_int_feature else tf.float32)
-
-  def get_default_values(self):
-    if self.default_value_fn:
-      return self.default_value_fn(self.shape)
-    else:
-      return np.zeros(
-          self.shape, np.int64 if self.is_int_feature else np.float32)
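
For context on what this removal drops: FeatureSpec ties a named, fixed-shape model input to a tf.io.FixedLenFeature parsing spec and an all-zeros default. A minimal, self-contained sketch of that pattern (the max_seq_length value is assumed for illustration):

```python
import numpy as np
import tensorflow as tf

max_seq_length = 128  # assumed value; the real one comes from FinetuningConfig

# Each shared model input from get_shared_feature_specs, expressed as the
# parsing spec that FeatureSpec.get_parsing_spec() would produce.
parsing_spec = {
    "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "task_id": tf.io.FixedLenFeature([], tf.int64),
}

# What FeatureSpec.get_default_values() returns when no default_value_fn is
# given: an all-zeros array of the declared shape.
defaults = {name: np.zeros(spec.shape, np.int64)
            for name, spec in parsing_spec.items()}
```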

sinatools/arabert/araelectra/finetune/preprocessing.py
@@ -1,173 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Code for serializing raw fine-tuning data into tfrecords"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import os
-import random
-import numpy as np
-import tensorflow as tf
-
-import configure_finetuning
-from finetune import feature_spec
-from util import utils
-
-
-class Preprocessor(object):
-  """Class for loading, preprocessing, and serializing fine-tuning datasets."""
-
-  def __init__(self, config: configure_finetuning.FinetuningConfig, tasks):
-    self._config = config
-    self._tasks = tasks
-    self._name_to_task = {task.name: task for task in tasks}
-
-    self._feature_specs = feature_spec.get_shared_feature_specs(config)
-    for task in tasks:
-      self._feature_specs += task.get_feature_specs()
-    self._name_to_feature_config = {
-        spec.name: spec.get_parsing_spec()
-        for spec in self._feature_specs
-    }
-    assert len(self._name_to_feature_config) == len(self._feature_specs)
-
-  def prepare_train(self):
-    return self._serialize_dataset(self._tasks, True, "train")
-
-  def prepare_predict(self, tasks, split):
-    return self._serialize_dataset(tasks, False, split)
-
-  def _serialize_dataset(self, tasks, is_training, split):
-    """Write out the dataset as tfrecords."""
-    dataset_name = "_".join(sorted([task.name for task in tasks]))
-    dataset_name += "_" + split
-    dataset_prefix = os.path.join(
-        self._config.preprocessed_data_dir, dataset_name)
-    tfrecords_path = dataset_prefix + ".tfrecord"
-    metadata_path = dataset_prefix + ".metadata"
-    batch_size = (self._config.train_batch_size if is_training else
-                  self._config.eval_batch_size)
-
-    utils.log("Loading dataset", dataset_name)
-    n_examples = None
-    if (self._config.use_tfrecords_if_existing and
-        tf.io.gfile.exists(metadata_path)):
-      n_examples = utils.load_json(metadata_path)["n_examples"]
-
-    if n_examples is None:
-      utils.log("Existing tfrecords not found so creating")
-      examples = []
-      for task in tasks:
-        task_examples = task.get_examples(split)
-        examples += task_examples
-      if is_training:
-        random.shuffle(examples)
-      utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
-      n_examples = self.serialize_examples(
-          examples, is_training, tfrecords_path, batch_size)
-      utils.write_json({"n_examples": n_examples}, metadata_path)
-
-    input_fn = self._input_fn_builder(tfrecords_path, is_training)
-    if is_training:
-      steps = int(n_examples // batch_size * self._config.num_train_epochs)
-    else:
-      steps = n_examples // batch_size
-
-    return input_fn, steps
-
-  def serialize_examples(self, examples, is_training, output_file, batch_size):
-    """Convert a set of `InputExample`s to a TFRecord file."""
-    n_examples = 0
-    with tf.io.TFRecordWriter(output_file) as writer:
-      for (ex_index, example) in enumerate(examples):
-        if ex_index % 2000 == 0:
-          utils.log("Writing example {:} of {:}".format(
-              ex_index, len(examples)))
-        for tf_example in self._example_to_tf_example(
-            example, is_training,
-            log=self._config.log_examples and ex_index < 1):
-          writer.write(tf_example.SerializeToString())
-          n_examples += 1
-      # add padding so the dataset is a multiple of batch_size
-      while n_examples % batch_size != 0:
-        writer.write(self._make_tf_example(task_id=len(self._config.task_names))
-                     .SerializeToString())
-        n_examples += 1
-    return n_examples
-
-  def _example_to_tf_example(self, example, is_training, log=False):
-    examples = self._name_to_task[example.task_name].featurize(
-        example, is_training, log)
-    if not isinstance(examples, list):
-      examples = [examples]
-    for example in examples:
-      yield self._make_tf_example(**example)
-
-  def _make_tf_example(self, **kwargs):
-    """Make a tf.train.Example from the provided features."""
-    for k in kwargs:
-      if k not in self._name_to_feature_config:
-        raise ValueError("Unknown feature", k)
-    features = collections.OrderedDict()
-    for spec in self._feature_specs:
-      if spec.name in kwargs:
-        values = kwargs[spec.name]
-      else:
-        values = spec.get_default_values()
-      if (isinstance(values, int) or isinstance(values, bool) or
-          isinstance(values, float) or isinstance(values, np.float32) or
-          (isinstance(values, np.ndarray) and values.size == 1)):
-        values = [values]
-      if spec.is_int_feature:
-        feature = tf.train.Feature(int64_list=tf.train.Int64List(
-            value=list(values)))
-      else:
-        feature = tf.train.Feature(float_list=tf.train.FloatList(
-            value=list(values)))
-      features[spec.name] = feature
-    return tf.train.Example(features=tf.train.Features(feature=features))
-
-  def _input_fn_builder(self, input_file, is_training):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-    def input_fn(params):
-      """The actual input function."""
-      d = tf.data.TFRecordDataset(input_file)
-      if is_training:
-        d = d.repeat()
-        d = d.shuffle(buffer_size=100)
-      return d.apply(
-          tf.data.experimental.map_and_batch(
-              self._decode_tfrecord,
-              batch_size=params["batch_size"],
-              drop_remainder=True))
-
-    return input_fn
-
-  def _decode_tfrecord(self, record):
-    """Decodes a record to a TensorFlow example."""
-    example = tf.io.parse_single_example(record, self._name_to_feature_config)
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name, tensor in example.items():
-      if tensor.dtype == tf.int64:
-        example[name] = tf.cast(tensor, tf.int32)
-      else:
-        example[name] = tensor
-    return example
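
The removed Preprocessor writes each featurized example as a tf.train.Example and later parses it back through the FixedLenFeature map, casting int64 to int32 for TPU use. A self-contained sketch of that round trip (feature names and values are illustrative):

```python
import tensorflow as tf

# Serialize one example the way _make_tf_example does: an Int64List (or
# FloatList) feature per spec.
features = {
    "input_ids": tf.train.Feature(
        int64_list=tf.train.Int64List(value=[101, 7592, 102, 0])),
    "task_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
}
serialized = tf.train.Example(
    features=tf.train.Features(feature=features)).SerializeToString()

# Parse it back through a FixedLenFeature map and cast int64 -> int32,
# mirroring _decode_tfrecord.
spec = {
    "input_ids": tf.io.FixedLenFeature([4], tf.int64),
    "task_id": tf.io.FixedLenFeature([], tf.int64),
}
parsed = tf.io.parse_single_example(serialized, spec)
parsed = {name: tf.cast(t, tf.int32) if t.dtype == tf.int64 else t
          for name, t in parsed.items()}
```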

sinatools/arabert/araelectra/finetune/scorer.py
@@ -1,54 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Base class for evaluation metrics."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-
-
-class Scorer(object):
-  """Abstract base class for computing evaluation metrics."""
-
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self):
-    self._updated = False
-    self._cached_results = {}
-
-  @abc.abstractmethod
-  def update(self, results):
-    self._updated = True
-
-  @abc.abstractmethod
-  def get_loss(self):
-    pass
-
-  @abc.abstractmethod
-  def _get_results(self):
-    return []
-
-  def get_results(self, prefix=""):
-    results = self._get_results() if self._updated else self._cached_results
-    self._cached_results = results
-    self._updated = False
-    return [(prefix + k, v) for k, v in results]
-
-  def results_str(self):
-    return " - ".join(["{:}: {:.2f}".format(k, v)
-                       for k, v in self.get_results()])
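
Scorer caches the output of _get_results() until update() marks the metrics stale. A hypothetical concrete subclass illustrating that contract, assuming the Scorer base class from the removed scorer.py is importable and that `results` is a dict with predictions/labels/loss keys:

```python
# Hypothetical subclass; the results schema below is an assumption, not part
# of the removed package.
class AccuracyScorer(Scorer):
  """Tracks accuracy and mean loss."""

  def __init__(self):
    super(AccuracyScorer, self).__init__()
    self._correct, self._total, self._loss = 0, 0, 0.0

  def update(self, results):
    super(AccuracyScorer, self).update(results)  # marks cached results stale
    self._correct += int(results["predictions"] == results["labels"])
    self._total += 1
    self._loss += results["loss"]

  def get_loss(self):
    return self._loss / max(1, self._total)

  def _get_results(self):
    return [("accuracy", 100.0 * self._correct / max(1, self._total)),
            ("loss", self.get_loss())]


# get_results() and results_str() come from the base class and reuse the
# cached results until the next update() call.
```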

sinatools/arabert/araelectra/finetune/task.py
@@ -1,74 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines a supervised NLP task."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-from typing import List, Tuple
-
-import configure_finetuning
-from finetune import feature_spec
-from finetune import scorer
-from model import modeling
-
-
-class Example(object):
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self, task_name):
-    self.task_name = task_name
-
-
-class Task(object):
-  """Override this class to add a new fine-tuning task."""
-
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self, config: configure_finetuning.FinetuningConfig, name):
-    self.config = config
-    self.name = name
-
-  def get_test_splits(self):
-    return ["test"]
-
-  @abc.abstractmethod
-  def get_examples(self, split):
-    pass
-
-  @abc.abstractmethod
-  def get_scorer(self) -> scorer.Scorer:
-    pass
-
-  @abc.abstractmethod
-  def get_feature_specs(self) -> List[feature_spec.FeatureSpec]:
-    pass
-
-  @abc.abstractmethod
-  def featurize(self, example: Example, is_training: bool,
-                log: bool=False):
-    pass
-
-  @abc.abstractmethod
-  def get_prediction_module(
-      self, bert_model: modeling.BertModel, features: dict, is_training: bool,
-      percent_done: float) -> Tuple:
-    pass
-
-  def __repr__(self):
-    return "Task(" + self.name + ")"
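
Task is the extension point the removed fine-tuning code uses to plug in datasets: a subclass supplies examples, feature specs, a scorer, and a prediction head. A skeletal sketch, assuming the classes above and a BERT-style tokenizer are available; the "sentiment" task, its fields, and its data handling are hypothetical:

```python
# Hypothetical example class for the sketch.
class SentimentExample(Example):
  def __init__(self, text, label):
    super(SentimentExample, self).__init__("sentiment")
    self.text = text
    self.label = label


class SentimentTask(Task):
  """Sketch of a single-sentence classification task."""

  def __init__(self, config, tokenizer):
    super(SentimentTask, self).__init__(config, "sentiment")
    self._tokenizer = tokenizer

  def get_examples(self, split):
    # A real task would read the raw files for this split from disk.
    return [SentimentExample("an example sentence", 1)]

  def get_scorer(self):
    return AccuracyScorer()  # e.g. the hypothetical scorer sketched earlier

  def get_feature_specs(self):
    return [feature_spec.FeatureSpec("sentiment_label", [])]

  def featurize(self, example, is_training, log=False):
    ids = self._tokenizer.convert_tokens_to_ids(
        self._tokenizer.tokenize(example.text))[:self.config.max_seq_length]
    ids += [0] * (self.config.max_seq_length - len(ids))
    return {"input_ids": ids,
            "input_mask": [1 if i else 0 for i in ids],
            "sentiment_label": example.label}

  def get_prediction_module(self, bert_model, features, is_training,
                            percent_done):
    raise NotImplementedError("model head omitted from this sketch")
```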

sinatools/arabert/araelectra/finetune/task_builder.py
@@ -1,70 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Returns task instances given the task name."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import configure_finetuning
-from finetune.classification import classification_tasks
-from finetune.qa import qa_tasks
-from finetune.tagging import tagging_tasks
-from model import tokenization
-
-
-def get_tasks(config: configure_finetuning.FinetuningConfig):
-  tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
-                                         do_lower_case=config.do_lower_case)
-  return [get_task(config, task_name, tokenizer)
-          for task_name in config.task_names]
-
-
-def get_task(config: configure_finetuning.FinetuningConfig, task_name,
-             tokenizer):
-  """Get an instance of a task based on its name."""
-  if task_name == "cola":
-    return classification_tasks.CoLA(config, tokenizer)
-  elif task_name == "mrpc":
-    return classification_tasks.MRPC(config, tokenizer)
-  elif task_name == "mnli":
-    return classification_tasks.MNLI(config, tokenizer)
-  elif task_name == "sst":
-    return classification_tasks.SST(config, tokenizer)
-  elif task_name == "rte":
-    return classification_tasks.RTE(config, tokenizer)
-  elif task_name == "qnli":
-    return classification_tasks.QNLI(config, tokenizer)
-  elif task_name == "qqp":
-    return classification_tasks.QQP(config, tokenizer)
-  elif task_name == "sts":
-    return classification_tasks.STS(config, tokenizer)
-  elif task_name == "squad":
-    return qa_tasks.SQuAD(config, tokenizer)
-  elif task_name == "squadv1":
-    return qa_tasks.SQuADv1(config, tokenizer)
-  elif task_name == "newsqa":
-    return qa_tasks.NewsQA(config, tokenizer)
-  elif task_name == "naturalqs":
-    return qa_tasks.NaturalQuestions(config, tokenizer)
-  elif task_name == "triviaqa":
-    return qa_tasks.TriviaQA(config, tokenizer)
-  elif task_name == "searchqa":
-    return qa_tasks.SearchQA(config, tokenizer)
-  elif task_name == "chunk":
-    return tagging_tasks.Chunking(config, tokenizer)
-  else:
-    raise ValueError("Unknown task " + task_name)

sinatools/arabert/araelectra/flops_computation.py
@@ -1,215 +0,0 @@
-"""Computes the flops needed for training/running transformer networks."""
-
-import collections
-
-# We checked this code with TensorFlow"s FLOPs counting, although we had to
-# correct for this issue: https://github.com/tensorflow/tensorflow/issues/22071
-# Assumptions going into the FLOPs counting
-# - An "operation" is a mathematical operation, not a machine instruction. So
-#   an "exp" takes one opp like and add, even though in practice an exp
-#   might be slower. This is not too bad an assumption because
-#   matrix-multiplies dominate the compute for most models, so minor details
-#   about activation functions don"t matter too much. Similarly, we count
-#   matrix-multiplies as 2*m*n flops instead of m*n, as one might if
-#   if considering fused multiply-add ops.
-# - Backward pass takes the same number of FLOPs as forward pass. No exactly
-#   right (e.g., for softmax cross entropy loss the backward pass is faster).
-#   Importantly, it really is the same for matrix-multiplies, which is most of
-#   the compute anyway.
-# - We assume "dense" embedding lookups (i.e., multiplication by a one-hot
-#   vector). On some hardware accelerators, these dense operations are
-#   actually faster than sparse lookups.
-# Please open a github issue if you spot a problem with this code!
-
-# I am not sure if the below constants are 100% right, but they are only applied
-# to O(hidden_size) activations, which is generally a lot less compute than the
-# matrix-multiplies, which are O(hidden_size^2), so they don't affect the total
-# number of FLOPs much.
-
-# random number, >=, multiply activations by dropout mask, multiply activations
-# by correction (1 / (1 - dropout_rate))
-DROPOUT_FLOPS = 4
-
-# compute mean activation (sum), computate variance of activation
-# (square and sum), bias (add), scale (multiply)
-LAYER_NORM_FLOPS = 5
-
-# GELU: 0.5 * x * (1 + tanh(sqrt(2 / np.pi) * (x + 0.044715 * pow(x, 3))))
-ACTIVATION_FLOPS = 8
-
-# max/substract (for stability), exp, sum, divide
-SOFTMAX_FLOPS = 5
-
-
-class TransformerHparams(object):
-  """Computes the train/inference FLOPs for transformers."""
-
-  def __init__(self, h, l, s=512, v=30522, e=None, i=None, heads=None,
-               head_size=None, output_frac=0.15625, sparse_embed_lookup=False,
-               decoder=False):
-    self.h = h  # hidden size
-    self.l = l  # number of layers
-    self.s = s  # sequence length
-    self.v = v  # vocab size
-    self.e = h if e is None else e  # embedding size
-    self.i = h * 4 if i is None else i  # intermediate size
-    self.kqv = h if head_size is None else head_size * heads  # attn proj sizes
-    self.heads = max(h // 64, 1) if heads is None else heads  # attention heads
-    self.output_frac = output_frac  # percent of tokens using an output softmax
-    self.sparse_embed_lookup = sparse_embed_lookup  # sparse embedding lookups
-    self.decoder = decoder  # decoder has extra attn to encoder states
-
-  def get_block_flops(self):
-    """Get the forward-pass FLOPs for a single transformer block."""
-    attn_mul = 2 if self.decoder else 1
-    block_flops = dict(
-        kqv=3 * 2 * self.h * self.kqv * attn_mul,
-        kqv_bias=3 * self.kqv * attn_mul,
-        attention_scores=2 * self.kqv * self.s * attn_mul,
-        attn_softmax=SOFTMAX_FLOPS * self.s * self.heads * attn_mul,
-        attention_dropout=DROPOUT_FLOPS * self.s * self.heads * attn_mul,
-        attention_scale=self.s * self.heads * attn_mul,
-        attention_weighted_avg_values=2 * self.h * self.s * attn_mul,
-        attn_output=2 * self.h * self.h * attn_mul,
-        attn_output_bias=self.h * attn_mul,
-        attn_output_dropout=DROPOUT_FLOPS * self.h * attn_mul,
-        attn_output_residual=self.h * attn_mul,
-        attn_output_layer_norm=LAYER_NORM_FLOPS * attn_mul,
-        intermediate=2 * self.h * self.i,
-        intermediate_act=ACTIVATION_FLOPS * self.i,
-        intermediate_bias=self.i,
-        output=2 * self.h * self.i,
-        output_bias=self.h,
-        output_dropout=DROPOUT_FLOPS * self.h,
-        output_residual=self.h,
-        output_layer_norm=LAYER_NORM_FLOPS * self.h,
-    )
-    return sum(block_flops.values()) * self.s
-
-  def get_embedding_flops(self, output=False):
-    """Get the forward-pass FLOPs the transformer inputs or output softmax."""
-    embedding_flops = {}
-    if output or (not self.sparse_embed_lookup):
-      embedding_flops["main_multiply"] = 2 * self.e * self.v
-    # input embedding post-processing
-    if not output:
-      embedding_flops.update(dict(
-          tok_type_and_position=2 * self.e * (self.s + 2),
-          add_tok_type_and_position=2 * self.e,
-          emb_layer_norm=LAYER_NORM_FLOPS * self.e,
-          emb_dropout=DROPOUT_FLOPS * self.e
-      ))
-    # projection layer if e != h
-    if self.e != self.h or output:
-      embedding_flops.update(dict(
-          hidden_kernel=2 * self.h * self.e,
-          hidden_bias=self.e if output else self.h
-      ))
-      # extra hidden layer and output softmax
-      if output:
-        embedding_flops.update(dict(
-            hidden_activation=ACTIVATION_FLOPS * self.e,
-            hidden_layernorm=LAYER_NORM_FLOPS * self.e,
-            output_softmax=SOFTMAX_FLOPS * self.v,
-            output_target_word=2 * self.v
-        ))
-        return self.output_frac * sum(embedding_flops.values()) * self.s
-    return sum(embedding_flops.values()) * self.s
-
-  def get_binary_classification_flops(self):
-    classification_flops = dict(
-        hidden=2 * self.h * self.h,
-        hidden_bias=self.h,
-        hidden_act=ACTIVATION_FLOPS * self.h,
-        logits=2 * self.h
-    )
-    return sum(classification_flops.values()) * self.s
-
-  def get_train_flops(self, batch_size, train_steps, discriminator=False):
-    """Get the FLOPs for pre-training the transformer."""
-    # 2* for forward/backward pass
-    return 2 * batch_size * train_steps * (
-        (self.l * self.get_block_flops()) +
-        self.get_embedding_flops(output=False) +
-        (self.get_binary_classification_flops() if discriminator else
-         self.get_embedding_flops(output=True))
-    )
-
-  def get_infer_flops(self):
-    """Get the FLOPs for running inference with the transformer on a
-    classification task."""
-    return ((self.l * self.get_block_flops()) +
-            self.get_embedding_flops(output=False) +
-            self.get_binary_classification_flops())
-
-
-def get_electra_train_flops(
-    h_d, l_d, h_g, l_g, batch_size, train_steps, tied_embeddings,
-    e=None, s=512, output_frac=0.15625):
-  """Get the FLOPs needed for pre-training ELECTRA."""
-  if e is None:
-    e = h_d
-  disc = TransformerHparams(
-      h_d, l_d, s=s, e=e,
-      output_frac=output_frac).get_train_flops(batch_size, train_steps, True)
-  gen = TransformerHparams(
-      h_g, l_g, s=s, e=e if tied_embeddings else None,
-      output_frac=output_frac).get_train_flops(batch_size, train_steps)
-  return disc + gen
-
-
-MODEL_FLOPS = collections.OrderedDict([
-    # These runtimes were computed with tensorflow FLOPs counting instead of the
-    # script, as the neural architectures are quite different.
-    # 768648884 words in LM1b benchmark, 10 epochs with batch size 20,
-    # seq length 128, 568093262680 FLOPs per example.
-    ("elmo", 2 * 10 * 768648884 * 568093262680 / (20.0 * 128)),
-    # 15064773691518 is FLOPs for forward pass on 32 examples.
-    # Therefore 2 * steps * batch_size * 15064773691518 / 32 is XLNet compute
-    ("xlnet", 2 * 500000 * 8192 * 15064773691518 / 32.0),
-
-    # Runtimes computed with the script
-    ("gpt", TransformerHparams(768, 12, v=40000, output_frac=1.0).get_train_flops(
-        128, 960800)),
-    ("bert_small", TransformerHparams(256, 12, e=128, s=128).get_train_flops(128, 1.45e6)),
-    ("bert_base", TransformerHparams(768, 12).get_train_flops(256, 1e6)),
-    ("bert_large", TransformerHparams(1024, 24).get_train_flops(256, 1e6)),
-    ("electra_small", get_electra_train_flops(256, 12, 64, 12, 128, 1e6, True, s=128, e=128)),
-    ("electra_base", get_electra_train_flops(768, 12, 256, 12, 256, 766000, True)),
-    ("electra_400k", get_electra_train_flops(1024, 24, 256, 24, 2048, 400000, True)),
-    ("electra_1.75M", get_electra_train_flops(1024, 24, 256, 24, 2048, 1750000, True)),
-
-    # RoBERTa, ALBERT, and T5 have minor architectural differences from
-    # BERT/ELECTRA, but I believe they don't significantly effect the runtime,
-    # so we use this script for those models as well.
-    ("roberta", TransformerHparams(1024, 24, v=50265).get_train_flops(8000, 500000)),
-    ("albert", TransformerHparams(4096, 12, v=30000, e=128).get_train_flops(
-        4096, 1.5e6)),
-    ("t5_11b", TransformerHparams(
-        1024,  # hidden size
-        24,  # layers
-        v=32000,  # vocab size
-        i=65536,  # ff intermediate hidden size
-        heads=128, head_size=128,  # heads/head size
-        output_frac=0.0  # encoder has no output softmax
-    ).get_train_flops(2048, 1e6) +  # 1M steps with batch size 2048
-    TransformerHparams(
-        1024,
-        24,
-        v=32000,
-        i=65536,
-        heads=128, head_size=128,
-        output_frac=1.0,  # decoder has output softmax for all positions
-        decoder=True
-    ).get_train_flops(2048, 1e6))
-])
-
-
-def main():
-  for k, v in MODEL_FLOPS.items():
-    print(k, v)
-
-
-if __name__ == "__main__":
-  main()
-
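
Under the assumptions documented in the header comments, the removed script is driven by constructing TransformerHparams and summing per-block, embedding, and classifier FLOPs. A usage sketch with the same BERT-base and ELECTRA-base settings used in MODEL_FLOPS (assumes the definitions above are in scope):

```python
base = TransformerHparams(768, 12)     # BERT-base sizes; s=512, v=30522 defaults

per_block = base.get_block_flops()     # forward FLOPs of one transformer block
per_example = base.get_infer_flops()   # full forward pass plus classifier head

# Pre-training estimate: 2x for the backward pass, times batch_size * train_steps
# (256 and 1e6 are the "bert_base" settings in MODEL_FLOPS).
bert_base_train = base.get_train_flops(256, 1e6)

# ELECTRA pre-training = discriminator + generator with tied embeddings,
# matching the "electra_base" entry above.
electra_base_train = get_electra_train_flops(768, 12, 256, 12, 256, 766000, True)

print("bert_base:    {:.3e} train FLOPs".format(bert_base_train))
print("electra_base: {:.3e} train FLOPs".format(electra_base_train))
```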

@@ -1,14 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.