SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/araelectra/finetune/feature_spec.py
@@ -1,56 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines the inputs used when fine-tuning a model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-import configure_finetuning
-
-
-def get_shared_feature_specs(config: configure_finetuning.FinetuningConfig):
-  """Non-task-specific model inputs."""
-  return [
-      FeatureSpec("input_ids", [config.max_seq_length]),
-      FeatureSpec("input_mask", [config.max_seq_length]),
-      FeatureSpec("segment_ids", [config.max_seq_length]),
-      FeatureSpec("task_id", []),
-  ]
-
-
-class FeatureSpec(object):
-  """Defines a feature passed as input to the model."""
-
-  def __init__(self, name, shape, default_value_fn=None, is_int_feature=True):
-    self.name = name
-    self.shape = shape
-    self.default_value_fn = default_value_fn
-    self.is_int_feature = is_int_feature
-
-  def get_parsing_spec(self):
-    return tf.io.FixedLenFeature(
-        self.shape, tf.int64 if self.is_int_feature else tf.float32)
-
-  def get_default_values(self):
-    if self.default_value_fn:
-      return self.default_value_fn(self.shape)
-    else:
-      return np.zeros(
-          self.shape, np.int64 if self.is_int_feature else np.float32)

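For context on the removed feature_spec.py: each FeatureSpec pairs a feature name and fixed shape with a tf.io.FixedLenFeature parsing spec, and get_default_values supplies zero padding. A minimal sketch of how such specs are typically consumed (the 128-token max_seq_length is an assumed value for illustration, not one taken from this package):

import tensorflow as tf

MAX_SEQ_LENGTH = 128  # assumed value for illustration

# The same name -> FixedLenFeature mapping that get_parsing_spec() would build.
name_to_feature = {
    "input_ids": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
    "input_mask": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
    "task_id": tf.io.FixedLenFeature([], tf.int64),
}

def decode(record):
    # Parse one serialized tf.train.Example and cast int64 features to int32,
    # mirroring the _decode_tfrecord method in the next hunk.
    example = tf.io.parse_single_example(record, name_to_feature)
    return {name: tf.cast(t, tf.int32) if t.dtype == tf.int64 else t
            for name, t in example.items()}

# dataset = tf.data.TFRecordDataset("finetune_train.tfrecord").map(decode)
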
sinatools/arabert/araelectra/finetune/preprocessing.py
@@ -1,173 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Code for serializing raw fine-tuning data into tfrecords"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import os
-import random
-import numpy as np
-import tensorflow as tf
-
-import configure_finetuning
-from finetune import feature_spec
-from util import utils
-
-
-class Preprocessor(object):
-  """Class for loading, preprocessing, and serializing fine-tuning datasets."""
-
-  def __init__(self, config: configure_finetuning.FinetuningConfig, tasks):
-    self._config = config
-    self._tasks = tasks
-    self._name_to_task = {task.name: task for task in tasks}
-
-    self._feature_specs = feature_spec.get_shared_feature_specs(config)
-    for task in tasks:
-      self._feature_specs += task.get_feature_specs()
-    self._name_to_feature_config = {
-        spec.name: spec.get_parsing_spec()
-        for spec in self._feature_specs
-    }
-    assert len(self._name_to_feature_config) == len(self._feature_specs)
-
-  def prepare_train(self):
-    return self._serialize_dataset(self._tasks, True, "train")
-
-  def prepare_predict(self, tasks, split):
-    return self._serialize_dataset(tasks, False, split)
-
-  def _serialize_dataset(self, tasks, is_training, split):
-    """Write out the dataset as tfrecords."""
-    dataset_name = "_".join(sorted([task.name for task in tasks]))
-    dataset_name += "_" + split
-    dataset_prefix = os.path.join(
-        self._config.preprocessed_data_dir, dataset_name)
-    tfrecords_path = dataset_prefix + ".tfrecord"
-    metadata_path = dataset_prefix + ".metadata"
-    batch_size = (self._config.train_batch_size if is_training else
-                  self._config.eval_batch_size)
-
-    utils.log("Loading dataset", dataset_name)
-    n_examples = None
-    if (self._config.use_tfrecords_if_existing and
-        tf.io.gfile.exists(metadata_path)):
-      n_examples = utils.load_json(metadata_path)["n_examples"]
-
-    if n_examples is None:
-      utils.log("Existing tfrecords not found so creating")
-      examples = []
-      for task in tasks:
-        task_examples = task.get_examples(split)
-        examples += task_examples
-      if is_training:
-        random.shuffle(examples)
-      utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
-      n_examples = self.serialize_examples(
-          examples, is_training, tfrecords_path, batch_size)
-      utils.write_json({"n_examples": n_examples}, metadata_path)
-
-    input_fn = self._input_fn_builder(tfrecords_path, is_training)
-    if is_training:
-      steps = int(n_examples // batch_size * self._config.num_train_epochs)
-    else:
-      steps = n_examples // batch_size
-
-    return input_fn, steps
-
-  def serialize_examples(self, examples, is_training, output_file, batch_size):
-    """Convert a set of `InputExample`s to a TFRecord file."""
-    n_examples = 0
-    with tf.io.TFRecordWriter(output_file) as writer:
-      for (ex_index, example) in enumerate(examples):
-        if ex_index % 2000 == 0:
-          utils.log("Writing example {:} of {:}".format(
-              ex_index, len(examples)))
-        for tf_example in self._example_to_tf_example(
-            example, is_training,
-            log=self._config.log_examples and ex_index < 1):
-          writer.write(tf_example.SerializeToString())
-          n_examples += 1
-      # add padding so the dataset is a multiple of batch_size
-      while n_examples % batch_size != 0:
-        writer.write(self._make_tf_example(task_id=len(self._config.task_names))
-                     .SerializeToString())
-        n_examples += 1
-    return n_examples
-
-  def _example_to_tf_example(self, example, is_training, log=False):
-    examples = self._name_to_task[example.task_name].featurize(
-        example, is_training, log)
-    if not isinstance(examples, list):
-      examples = [examples]
-    for example in examples:
-      yield self._make_tf_example(**example)
-
-  def _make_tf_example(self, **kwargs):
-    """Make a tf.train.Example from the provided features."""
-    for k in kwargs:
-      if k not in self._name_to_feature_config:
-        raise ValueError("Unknown feature", k)
-    features = collections.OrderedDict()
-    for spec in self._feature_specs:
-      if spec.name in kwargs:
-        values = kwargs[spec.name]
-      else:
-        values = spec.get_default_values()
-      if (isinstance(values, int) or isinstance(values, bool) or
-          isinstance(values, float) or isinstance(values, np.float32) or
-          (isinstance(values, np.ndarray) and values.size == 1)):
-        values = [values]
-      if spec.is_int_feature:
-        feature = tf.train.Feature(int64_list=tf.train.Int64List(
-            value=list(values)))
-      else:
-        feature = tf.train.Feature(float_list=tf.train.FloatList(
-            value=list(values)))
-      features[spec.name] = feature
-    return tf.train.Example(features=tf.train.Features(feature=features))
-
-  def _input_fn_builder(self, input_file, is_training):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-
-    def input_fn(params):
-      """The actual input function."""
-      d = tf.data.TFRecordDataset(input_file)
-      if is_training:
-        d = d.repeat()
-        d = d.shuffle(buffer_size=100)
-      return d.apply(
-          tf.data.experimental.map_and_batch(
-              self._decode_tfrecord,
-              batch_size=params["batch_size"],
-              drop_remainder=True))
-
-    return input_fn
-
-  def _decode_tfrecord(self, record):
-    """Decodes a record to a TensorFlow example."""
-    example = tf.io.parse_single_example(record, self._name_to_feature_config)
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name, tensor in example.items():
-      if tensor.dtype == tf.int64:
-        example[name] = tf.cast(tensor, tf.int32)
-      else:
-        example[name] = tensor
-    return example

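To make the control flow of the removed Preprocessor concrete, here is a hedged usage sketch; config, tasks, and estimator are assumed objects (a FinetuningConfig, the task list from task_builder.get_tasks, and a tf.estimator-style estimator), not values defined in this package:

# Assumed objects: `config` is a configure_finetuning.FinetuningConfig,
# `tasks` comes from task_builder.get_tasks(config), and `estimator` is a
# tf.estimator.Estimator-like object built elsewhere.
preprocessor = Preprocessor(config, tasks)

# Serializes all task examples into a "<tasks>_train.tfrecord" file under
# preprocessed_data_dir (or reuses an existing one) and returns an input_fn
# plus the step count n_examples // train_batch_size * num_train_epochs.
train_input_fn, train_steps = preprocessor.prepare_train()

estimator.train(input_fn=train_input_fn, max_steps=train_steps)
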
sinatools/arabert/araelectra/finetune/scorer.py
@@ -1,54 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Base class for evaluation metrics."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-
-
-class Scorer(object):
-  """Abstract base class for computing evaluation metrics."""
-
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self):
-    self._updated = False
-    self._cached_results = {}
-
-  @abc.abstractmethod
-  def update(self, results):
-    self._updated = True
-
-  @abc.abstractmethod
-  def get_loss(self):
-    pass
-
-  @abc.abstractmethod
-  def _get_results(self):
-    return []
-
-  def get_results(self, prefix=""):
-    results = self._get_results() if self._updated else self._cached_results
-    self._cached_results = results
-    self._updated = False
-    return [(prefix + k, v) for k, v in results]
-
-  def results_str(self):
-    return " - ".join(["{:}: {:.2f}".format(k, v)
-                       for k, v in self.get_results()])

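The removed Scorer defines the update()/get_results() contract that concrete metrics implement, caching results between updates. As an illustration only, a hypothetical subclass (AccuracyScorer and its result keys are not part of the removed code) could look like:

class AccuracyScorer(Scorer):
    """Hypothetical metric built on the removed Scorer base class above."""

    def __init__(self):
        super(AccuracyScorer, self).__init__()
        self._correct = 0
        self._total = 0

    def update(self, results):
        # Marks the scorer as updated (base class) and accumulates counts.
        super(AccuracyScorer, self).update(results)
        self._correct += int(results["prediction"] == results["label"])
        self._total += 1

    def get_loss(self):
        return 0.0

    def _get_results(self):
        return [("accuracy", 100.0 * self._correct / max(1, self._total))]

# results_str() would then return something like "accuracy: 87.50".
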
sinatools/arabert/araelectra/finetune/task.py
@@ -1,74 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines a supervised NLP task."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-from typing import List, Tuple
-
-import configure_finetuning
-from finetune import feature_spec
-from finetune import scorer
-from model import modeling
-
-
-class Example(object):
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self, task_name):
-    self.task_name = task_name
-
-
-class Task(object):
-  """Override this class to add a new fine-tuning task."""
-
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self, config: configure_finetuning.FinetuningConfig, name):
-    self.config = config
-    self.name = name
-
-  def get_test_splits(self):
-    return ["test"]
-
-  @abc.abstractmethod
-  def get_examples(self, split):
-    pass
-
-  @abc.abstractmethod
-  def get_scorer(self) -> scorer.Scorer:
-    pass
-
-  @abc.abstractmethod
-  def get_feature_specs(self) -> List[feature_spec.FeatureSpec]:
-    pass
-
-  @abc.abstractmethod
-  def featurize(self, example: Example, is_training: bool,
-                log: bool=False):
-    pass
-
-  @abc.abstractmethod
-  def get_prediction_module(
-      self, bert_model: modeling.BertModel, features: dict, is_training: bool,
-      percent_done: float) -> Tuple:
-    pass
-
-  def __repr__(self):
-    return "Task(" + self.name + ")"

sinatools/arabert/araelectra/finetune/task_builder.py
@@ -1,70 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Returns task instances given the task name."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import configure_finetuning
-from finetune.classification import classification_tasks
-from finetune.qa import qa_tasks
-from finetune.tagging import tagging_tasks
-from model import tokenization
-
-
-def get_tasks(config: configure_finetuning.FinetuningConfig):
-  tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
-                                         do_lower_case=config.do_lower_case)
-  return [get_task(config, task_name, tokenizer)
-          for task_name in config.task_names]
-
-
-def get_task(config: configure_finetuning.FinetuningConfig, task_name,
-             tokenizer):
-  """Get an instance of a task based on its name."""
-  if task_name == "cola":
-    return classification_tasks.CoLA(config, tokenizer)
-  elif task_name == "mrpc":
-    return classification_tasks.MRPC(config, tokenizer)
-  elif task_name == "mnli":
-    return classification_tasks.MNLI(config, tokenizer)
-  elif task_name == "sst":
-    return classification_tasks.SST(config, tokenizer)
-  elif task_name == "rte":
-    return classification_tasks.RTE(config, tokenizer)
-  elif task_name == "qnli":
-    return classification_tasks.QNLI(config, tokenizer)
-  elif task_name == "qqp":
-    return classification_tasks.QQP(config, tokenizer)
-  elif task_name == "sts":
-    return classification_tasks.STS(config, tokenizer)
-  elif task_name == "squad":
-    return qa_tasks.SQuAD(config, tokenizer)
-  elif task_name == "squadv1":
-    return qa_tasks.SQuADv1(config, tokenizer)
-  elif task_name == "newsqa":
-    return qa_tasks.NewsQA(config, tokenizer)
-  elif task_name == "naturalqs":
-    return qa_tasks.NaturalQuestions(config, tokenizer)
-  elif task_name == "triviaqa":
-    return qa_tasks.TriviaQA(config, tokenizer)
-  elif task_name == "searchqa":
-    return qa_tasks.SearchQA(config, tokenizer)
-  elif task_name == "chunk":
-    return tagging_tasks.Chunking(config, tokenizer)
-  else:
-    raise ValueError("Unknown task " + task_name)

sinatools/arabert/araelectra/flops_computation.py
@@ -1,215 +0,0 @@
-"""Computes the flops needed for training/running transformer networks."""
-
-import collections
-
-# We checked this code with TensorFlow"s FLOPs counting, although we had to
-# correct for this issue: https://github.com/tensorflow/tensorflow/issues/22071
-# Assumptions going into the FLOPs counting
-# - An "operation" is a mathematical operation, not a machine instruction. So
-#   an "exp" takes one opp like and add, even though in practice an exp
-#   might be slower. This is not too bad an assumption because
-#   matrix-multiplies dominate the compute for most models, so minor details
-#   about activation functions don"t matter too much. Similarly, we count
-#   matrix-multiplies as 2*m*n flops instead of m*n, as one might if
-#   if considering fused multiply-add ops.
-# - Backward pass takes the same number of FLOPs as forward pass. No exactly
-#   right (e.g., for softmax cross entropy loss the backward pass is faster).
-#   Importantly, it really is the same for matrix-multiplies, which is most of
-#   the compute anyway.
-# - We assume "dense" embedding lookups (i.e., multiplication by a one-hot
-#   vector). On some hardware accelerators, these dense operations are
-#   actually faster than sparse lookups.
-# Please open a github issue if you spot a problem with this code!
-
-# I am not sure if the below constants are 100% right, but they are only applied
-# to O(hidden_size) activations, which is generally a lot less compute than the
-# matrix-multiplies, which are O(hidden_size^2), so they don't affect the total
-# number of FLOPs much.
-
-# random number, >=, multiply activations by dropout mask, multiply activations
-# by correction (1 / (1 - dropout_rate))
-DROPOUT_FLOPS = 4
-
-# compute mean activation (sum), computate variance of activation
-# (square and sum), bias (add), scale (multiply)
-LAYER_NORM_FLOPS = 5
-
-# GELU: 0.5 * x * (1 + tanh(sqrt(2 / np.pi) * (x + 0.044715 * pow(x, 3))))
-ACTIVATION_FLOPS = 8
-
-# max/substract (for stability), exp, sum, divide
-SOFTMAX_FLOPS = 5
-
-
-class TransformerHparams(object):
-  """Computes the train/inference FLOPs for transformers."""
-
-  def __init__(self, h, l, s=512, v=30522, e=None, i=None, heads=None,
-               head_size=None, output_frac=0.15625, sparse_embed_lookup=False,
-               decoder=False):
-    self.h = h  # hidden size
-    self.l = l  # number of layers
-    self.s = s  # sequence length
-    self.v = v  # vocab size
-    self.e = h if e is None else e  # embedding size
-    self.i = h * 4 if i is None else i  # intermediate size
-    self.kqv = h if head_size is None else head_size * heads  # attn proj sizes
-    self.heads = max(h // 64, 1) if heads is None else heads  # attention heads
-    self.output_frac = output_frac  # percent of tokens using an output softmax
-    self.sparse_embed_lookup = sparse_embed_lookup  # sparse embedding lookups
-    self.decoder = decoder  # decoder has extra attn to encoder states
-
-  def get_block_flops(self):
-    """Get the forward-pass FLOPs for a single transformer block."""
-    attn_mul = 2 if self.decoder else 1
-    block_flops = dict(
-        kqv=3 * 2 * self.h * self.kqv * attn_mul,
-        kqv_bias=3 * self.kqv * attn_mul,
-        attention_scores=2 * self.kqv * self.s * attn_mul,
-        attn_softmax=SOFTMAX_FLOPS * self.s * self.heads * attn_mul,
-        attention_dropout=DROPOUT_FLOPS * self.s * self.heads * attn_mul,
-        attention_scale=self.s * self.heads * attn_mul,
-        attention_weighted_avg_values=2 * self.h * self.s * attn_mul,
-        attn_output=2 * self.h * self.h * attn_mul,
-        attn_output_bias=self.h * attn_mul,
-        attn_output_dropout=DROPOUT_FLOPS * self.h * attn_mul,
-        attn_output_residual=self.h * attn_mul,
-        attn_output_layer_norm=LAYER_NORM_FLOPS * attn_mul,
-        intermediate=2 * self.h * self.i,
-        intermediate_act=ACTIVATION_FLOPS * self.i,
-        intermediate_bias=self.i,
-        output=2 * self.h * self.i,
-        output_bias=self.h,
-        output_dropout=DROPOUT_FLOPS * self.h,
-        output_residual=self.h,
-        output_layer_norm=LAYER_NORM_FLOPS * self.h,
-    )
-    return sum(block_flops.values()) * self.s
-
-  def get_embedding_flops(self, output=False):
-    """Get the forward-pass FLOPs the transformer inputs or output softmax."""
-    embedding_flops = {}
-    if output or (not self.sparse_embed_lookup):
-      embedding_flops["main_multiply"] = 2 * self.e * self.v
-    # input embedding post-processing
-    if not output:
-      embedding_flops.update(dict(
-          tok_type_and_position=2 * self.e * (self.s + 2),
-          add_tok_type_and_position=2 * self.e,
-          emb_layer_norm=LAYER_NORM_FLOPS * self.e,
-          emb_dropout=DROPOUT_FLOPS * self.e
-      ))
-    # projection layer if e != h
-    if self.e != self.h or output:
-      embedding_flops.update(dict(
-          hidden_kernel=2 * self.h * self.e,
-          hidden_bias=self.e if output else self.h
-      ))
-      # extra hidden layer and output softmax
-      if output:
-        embedding_flops.update(dict(
-            hidden_activation=ACTIVATION_FLOPS * self.e,
-            hidden_layernorm=LAYER_NORM_FLOPS * self.e,
-            output_softmax=SOFTMAX_FLOPS * self.v,
-            output_target_word=2 * self.v
-        ))
-        return self.output_frac * sum(embedding_flops.values()) * self.s
-    return sum(embedding_flops.values()) * self.s
-
-  def get_binary_classification_flops(self):
-    classification_flops = dict(
-        hidden=2 * self.h * self.h,
-        hidden_bias=self.h,
-        hidden_act=ACTIVATION_FLOPS * self.h,
-        logits=2 * self.h
-    )
-    return sum(classification_flops.values()) * self.s
-
-  def get_train_flops(self, batch_size, train_steps, discriminator=False):
-    """Get the FLOPs for pre-training the transformer."""
-    # 2* for forward/backward pass
-    return 2 * batch_size * train_steps * (
-        (self.l * self.get_block_flops()) +
-        self.get_embedding_flops(output=False) +
-        (self.get_binary_classification_flops() if discriminator else
-         self.get_embedding_flops(output=True))
-    )
-
-  def get_infer_flops(self):
-    """Get the FLOPs for running inference with the transformer on a
-    classification task."""
-    return ((self.l * self.get_block_flops()) +
-            self.get_embedding_flops(output=False) +
-            self.get_binary_classification_flops())
-
-
-def get_electra_train_flops(
-    h_d, l_d, h_g, l_g, batch_size, train_steps, tied_embeddings,
-    e=None, s=512, output_frac=0.15625):
-  """Get the FLOPs needed for pre-training ELECTRA."""
-  if e is None:
-    e = h_d
-  disc = TransformerHparams(
-      h_d, l_d, s=s, e=e,
-      output_frac=output_frac).get_train_flops(batch_size, train_steps, True)
-  gen = TransformerHparams(
-      h_g, l_g, s=s, e=e if tied_embeddings else None,
-      output_frac=output_frac).get_train_flops(batch_size, train_steps)
-  return disc + gen
-
-
-MODEL_FLOPS = collections.OrderedDict([
-    # These runtimes were computed with tensorflow FLOPs counting instead of the
-    # script, as the neural architectures are quite different.
-    # 768648884 words in LM1b benchmark, 10 epochs with batch size 20,
-    # seq length 128, 568093262680 FLOPs per example.
-    ("elmo", 2 * 10 * 768648884 * 568093262680 / (20.0 * 128)),
-    # 15064773691518 is FLOPs for forward pass on 32 examples.
-    # Therefore 2 * steps * batch_size * 15064773691518 / 32 is XLNet compute
-    ("xlnet", 2 * 500000 * 8192 * 15064773691518 / 32.0),
-
-    # Runtimes computed with the script
-    ("gpt", TransformerHparams(768, 12, v=40000, output_frac=1.0).get_train_flops(
-        128, 960800)),
-    ("bert_small", TransformerHparams(256, 12, e=128, s=128).get_train_flops(128, 1.45e6)),
-    ("bert_base", TransformerHparams(768, 12).get_train_flops(256, 1e6)),
-    ("bert_large", TransformerHparams(1024, 24).get_train_flops(256, 1e6)),
-    ("electra_small", get_electra_train_flops(256, 12, 64, 12, 128, 1e6, True, s=128, e=128)),
-    ("electra_base", get_electra_train_flops(768, 12, 256, 12, 256, 766000, True)),
-    ("electra_400k", get_electra_train_flops(1024, 24, 256, 24, 2048, 400000, True)),
-    ("electra_1.75M", get_electra_train_flops(1024, 24, 256, 24, 2048, 1750000, True)),
-
-    # RoBERTa, ALBERT, and T5 have minor architectural differences from
-    # BERT/ELECTRA, but I believe they don't significantly effect the runtime,
-    # so we use this script for those models as well.
-    ("roberta", TransformerHparams(1024, 24, v=50265).get_train_flops(8000, 500000)),
-    ("albert", TransformerHparams(4096, 12, v=30000, e=128).get_train_flops(
-        4096, 1.5e6)),
-    ("t5_11b", TransformerHparams(
-        1024,  # hidden size
-        24,  # layers
-        v=32000,  # vocab size
-        i=65536,  # ff intermediate hidden size
-        heads=128, head_size=128,  # heads/head size
-        output_frac=0.0  # encoder has no output softmax
-    ).get_train_flops(2048, 1e6) +  # 1M steps with batch size 2048
-    TransformerHparams(
-        1024,
-        24,
-        v=32000,
-        i=65536,
-        heads=128, head_size=128,
-        output_frac=1.0,  # decoder has output softmax for all positions
-        decoder=True
-    ).get_train_flops(2048, 1e6))
-])
-
-
-def main():
-  for k, v in MODEL_FLOPS.items():
-    print(k, v)
-
-
-if __name__ == "__main__":
-  main()
-

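The comment block at the top of the removed flops_computation.py states the counting assumptions (2*m*n FLOPs per matrix multiply, backward pass counted equal to the forward pass, dense embedding lookups). As a sketch of how the MODEL_FLOPS entries above are produced, assuming the module were still importable under its old path:

from sinatools.arabert.araelectra.flops_computation import TransformerHparams

# BERT-base: hidden size 768, 12 layers, defaults s=512 and v=30522;
# trained for 1M steps at batch size 256 (the "bert_base" entry above).
bert_base = TransformerHparams(768, 12)
print("per-block forward FLOPs:", bert_base.get_block_flops())
print("pre-training FLOPs:", bert_base.get_train_flops(batch_size=256, train_steps=1e6))
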
@@ -1,14 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.