SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,109 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""A collection of general utility functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-import pickle
-import sys
-
-import tensorflow as tf
-
-
-def load_json(path):
-    with tf.io.gfile.GFile(path, "r") as f:
-        return json.load(f)
-
-
-def write_json(o, path):
-    if "/" in path:
-        tf.io.gfile.makedirs(path.rsplit("/", 1)[0])
-    with tf.io.gfile.GFile(path, "w") as f:
-        json.dump(o, f)
-
-
-def load_pickle(path):
-    with tf.io.gfile.GFile(path, "rb") as f:
-        return pickle.load(f)
-
-
-def write_pickle(o, path):
-    if "/" in path:
-        tf.io.gfile.makedirs(path.rsplit("/", 1)[0])
-    with tf.io.gfile.GFile(path, "wb") as f:
-        pickle.dump(o, f, -1)
-
-
-def mkdir(path):
-    if not tf.io.gfile.exists(path):
-        tf.io.gfile.makedirs(path)
-
-
-def rmrf(path):
-    if tf.io.gfile.exists(path):
-        tf.io.gfile.rmtree(path)
-
-
-def rmkdir(path):
-    rmrf(path)
-    mkdir(path)
-
-
-def log(*args):
-    msg = " ".join(map(str, args))
-    sys.stdout.write(msg + "\n")
-    sys.stdout.flush()
-
-
-def log_config(config):
-    for key, value in sorted(config.__dict__.items()):
-        log(key, value)
-    log()
-
-
-def heading(*args):
-    log(80 * "=")
-    log(*args)
-    log(80 * "=")
-
-
-def nest_dict(d, prefixes, delim="_"):
-    """Go from {prefix_key: value} to {prefix: {key: value}}."""
-    nested = {}
-    for k, v in d.items():
-        for prefix in prefixes:
-            if k.startswith(prefix + delim):
-                if prefix not in nested:
-                    nested[prefix] = {}
-                nested[prefix][k.split(delim, 1)[1]] = v
-            else:
-                nested[k] = v
-    return nested
-
-
-def flatten_dict(d, delim="_"):
-    """Go from {prefix: {key: value}} to {prefix_key: value}."""
-    flattened = {}
-    for k, v in d.items():
-        if isinstance(v, dict):
-            for k2, v2 in v.items():
-                flattened[k + delim + k2] = v2
-        else:
-            flattened[k] = v
-    return flattened
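The only non-trivial logic in the removed utility module is the `nest_dict`/`flatten_dict` pair, which convert between `{prefix_key: value}` and `{prefix: {key: value}}` layouts. As a point of reference (not code from either package version), a cleaned-up, framework-free sketch of the same round trip:

```python
# Standalone sketch of the nest/flatten key transforms; pure Python, no TensorFlow.

def flatten_dict(d, delim="_"):
    """{prefix: {key: value}} -> {prefix_key: value}."""
    flat = {}
    for k, v in d.items():
        if isinstance(v, dict):
            for k2, v2 in v.items():
                flat[k + delim + k2] = v2
        else:
            flat[k] = v
    return flat


def nest_dict(d, prefixes, delim="_"):
    """{prefix_key: value} -> {prefix: {key: value}} for the given prefixes."""
    nested = {}
    for k, v in d.items():
        for prefix in prefixes:
            if k.startswith(prefix + delim):
                nested.setdefault(prefix, {})[k.split(delim, 1)[1]] = v
                break
        else:
            # No prefix matched: keep the key as-is.
            nested[k] = v
    return nested


config = {"model": {"hidden_size": 256, "layers": 4}, "seed": 42}
flat = flatten_dict(config)   # {'model_hidden_size': 256, 'model_layers': 4, 'seed': 42}
assert nest_dict(flat, ["model"]) == config
```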
@@ -1,95 +0,0 @@
-import collections
-from transformers import GPT2TokenizerFast
-import tensorflow as tf
-
-import sys
-sys.path.append("..")
-from arabert.preprocess import preprocess
-
-flags = tf.flags
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_string(
-    "input_file", None, "Input raw text file (or comma-separated list of files)."
-)
-
-flags.DEFINE_string(
-    "output_file", None, "Output TF example file (or comma-separated list of files)."
-)
-
-flags.DEFINE_string(
-    "tokenizer_dir", None, "The directory of a pretrained GPT2TokenizerFast"
-)
-
-flags.DEFINE_integer(
-    "max_len", 1024, "The vocabulary file that the BERT model was trained on."
-)
-
-flags.DEFINE_integer(
-    "num_examples_print", 0, "Number of examples to print"
-)
-
-
-def create_int_feature(values):
-    feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-    return feature
-
-
-def main(_):
-    tf.logging.set_verbosity(tf.logging.INFO)
-    logger = tf.get_logger()
-    logger.propagate = False
-
-    input_files = []
-    for input_pattern in FLAGS.input_file.split(","):
-        input_files.extend(tf.gfile.Glob(input_pattern))
-
-    tf.logging.info("*** Reading from input files ***")
-    for input_file in input_files:
-        tf.logging.info("  %s", input_file)
-
-    gpt2_tok = GPT2TokenizerFast.from_pretrained(FLAGS.tokenizer_dir)
-
-    writer = tf.python_io.TFRecordWriter(FLAGS.output_file + ".tfrecord")
-
-    eos_id = gpt2_tok.eos_token_id
-    all_examples = []
-    for input_file in input_files:
-        queue = []
-        example = []
-        with tf.gfile.GFile(input_file, "r") as reader:
-            for line in reader.readlines():
-                if line == "\n":
-                    queue.append(eos_id)
-                else:
-                    line = line.replace("\n", " ")
-                    line = preprocess(line, model='gpt2-base-arabic')
-                    line = line.strip()
-                    enc_line = gpt2_tok.encode(line)
-                    queue.extend(enc_line)
-                if len(queue) > FLAGS.max_len + 1:
-                    example = [queue.pop(0) for _ in range(FLAGS.max_len + 1)]
-                    assert len(example) == FLAGS.max_len + 1
-                    all_examples.append(example)
-
-
-    for i, ex in enumerate(all_examples):
-        features = collections.OrderedDict()
-        features["input_ids"] = create_int_feature(ex)
-        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-        writer.write(tf_example.SerializeToString())
-        if i < FLAGS.num_examples_print:
-            tf.logging.info("*** Example ***")
-            tf.logging.info("Length: %d" % len(ex))
-            tf.logging.info("Tokens: %s" % gpt2_tok.decode(ex))
-            tf.logging.info("ids: %s" % " ".join([str(x) for x in ex]))
-
-    tf.logging.info("Wrote %d total instances", len(all_examples))
-
-
-if __name__ == "__main__":
-    flags.mark_flag_as_required("input_file")
-    flags.mark_flag_as_required("output_file")
-    flags.mark_flag_as_required("tokenizer_dir")
-    tf.app.run()
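The core of the removed script is a packing loop: blank lines contribute the tokenizer's EOS id, other lines are preprocessed, encoded, and appended to a FIFO queue, and a fixed-length example of `max_len + 1` ids is emitted whenever the queue grows long enough. A minimal sketch of just that packing step (toy `toy_encode` is a hypothetical stand-in for `gpt2_tok.encode`; no preprocessing or TFRecord writing):

```python
# Toy illustration of the packing loop; not code from the package.

MAX_LEN = 8   # the real script defaults to 1024
EOS_ID = 0    # stands in for gpt2_tok.eos_token_id


def toy_encode(line):
    # One fake id per whitespace token (deterministic, just for the demo).
    return [ord(tok[0]) for tok in line.split()]


def pack(lines, max_len=MAX_LEN, eos_id=EOS_ID):
    queue, examples = [], []
    for line in lines:
        if line == "\n":
            queue.append(eos_id)                # document boundary
        else:
            queue.extend(toy_encode(line.strip()))
        # Emit one example of max_len + 1 ids (presumably so inputs and shifted
        # targets can both be sliced from it) whenever enough ids have accumulated.
        if len(queue) > max_len + 1:
            examples.append([queue.pop(0) for _ in range(max_len + 1)])
    return examples


print(pack(["a b c d e\n", "\n", "f g h i j k l m n o\n"]))
```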
@@ -1,158 +0,0 @@
-# coding=utf-8
-# Copyright 2019 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python2, python3
-"""Functions and classes related to optimization (weight updates)."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-import six
-import tensorflow as tf
-
-# pylint: disable=g-direct-tensorflow-import
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-
-# pylint: enable=g-direct-tensorflow-import
-
-
-class LAMBOptimizer(tf.train.Optimizer):
-    """LAMB (Layer-wise Adaptive Moments optimizer for Batch training)."""
-
-    # A new optimizer that includes correct L2 weight decay, adaptive
-    # element-wise updating, and layer-wise justification. The LAMB optimizer
-    # was proposed by Yang You, Jing Li, Jonathan Hseu, Xiaodan Song,
-    # James Demmel, and Cho-Jui Hsieh in a paper titled as Reducing BERT
-    # Pre-Training Time from 3 Days to 76 Minutes (arxiv.org/abs/1904.00962)
-
-    def __init__(
-        self,
-        learning_rate,
-        weight_decay_rate=0.0,
-        beta_1=0.9,
-        beta_2=0.999,
-        epsilon=1e-6,
-        exclude_from_weight_decay=None,
-        exclude_from_layer_adaptation=None,
-        name="LAMBOptimizer",
-    ):
-        """Constructs a LAMBOptimizer."""
-        super(LAMBOptimizer, self).__init__(False, name)
-
-        self.learning_rate = learning_rate
-        self.weight_decay_rate = weight_decay_rate
-        self.beta_1 = beta_1
-        self.beta_2 = beta_2
-        self.epsilon = epsilon
-        self.exclude_from_weight_decay = exclude_from_weight_decay
-        # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the
-        # arg is None.
-        # TODO(jingli): validate if exclude_from_layer_adaptation is necessary.
-        if exclude_from_layer_adaptation:
-            self.exclude_from_layer_adaptation = exclude_from_layer_adaptation
-        else:
-            self.exclude_from_layer_adaptation = exclude_from_weight_decay
-
-    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-        """See base class."""
-        assignments = []
-        for (grad, param) in grads_and_vars:
-            if grad is None or param is None:
-                continue
-
-            param_name = self._get_variable_name(param.name)
-
-            m = tf.get_variable(
-                name=six.ensure_str(param_name) + "/adam_m",
-                shape=param.shape.as_list(),
-                dtype=tf.float32,
-                trainable=False,
-                initializer=tf.zeros_initializer(),
-            )
-            v = tf.get_variable(
-                name=six.ensure_str(param_name) + "/adam_v",
-                shape=param.shape.as_list(),
-                dtype=tf.float32,
-                trainable=False,
-                initializer=tf.zeros_initializer(),
-            )
-
-            # Standard Adam update.
-            next_m = tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)
-            next_v = tf.multiply(self.beta_2, v) + tf.multiply(
-                1.0 - self.beta_2, tf.square(grad)
-            )
-
-            update = next_m / (tf.sqrt(next_v) + self.epsilon)
-
-            # Just adding the square of the weights to the loss function is *not*
-            # the correct way of using L2 regularization/weight decay with Adam,
-            # since that will interact with the m and v parameters in strange ways.
-            #
-            # Instead we want ot decay the weights in a manner that doesn't interact
-            # with the m/v parameters. This is equivalent to adding the square
-            # of the weights to the loss with plain (non-momentum) SGD.
-            if self._do_use_weight_decay(param_name):
-                update += self.weight_decay_rate * param
-
-            ratio = 1.0
-            if self._do_layer_adaptation(param_name):
-                w_norm = linalg_ops.norm(param, ord=2)
-                g_norm = linalg_ops.norm(update, ord=2)
-                ratio = array_ops.where(
-                    math_ops.greater(w_norm, 0),
-                    array_ops.where(
-                        math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0
-                    ),
-                    1.0,
-                )
-
-            update_with_lr = ratio * self.learning_rate * update
-
-            next_param = param - update_with_lr
-
-            assignments.extend(
-                [param.assign(next_param), m.assign(next_m), v.assign(next_v)]
-            )
-        return tf.group(*assignments, name=name)
-
-    def _do_use_weight_decay(self, param_name):
-        """Whether to use L2 weight decay for `param_name`."""
-        if not self.weight_decay_rate:
-            return False
-        if self.exclude_from_weight_decay:
-            for r in self.exclude_from_weight_decay:
-                if re.search(r, param_name) is not None:
-                    return False
-        return True
-
-    def _do_layer_adaptation(self, param_name):
-        """Whether to do layer-wise learning rate adaptation for `param_name`."""
-        if self.exclude_from_layer_adaptation:
-            for r in self.exclude_from_layer_adaptation:
-                if re.search(r, param_name) is not None:
-                    return False
-        return True
-
-    def _get_variable_name(self, param_name):
-        """Get the variable name from the tensor name."""
-        m = re.match("^(.*):\\d+$", six.ensure_str(param_name))
-        if m is not None:
-            param_name = m.group(1)
-        return param_name
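For readers comparing optimizers, the per-tensor update this deleted class computes is: Adam-style first and second moments without bias correction, decoupled weight decay added to the update, then a layer-wise trust ratio ||w|| / ||update|| scaling the learning rate (falling back to 1.0 when either norm is zero, or when the parameter is excluded by the regex lists). A NumPy sketch of one such step, as an illustration rather than code from the package:

```python
import numpy as np


def lamb_step(param, grad, m, v, lr,
              beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.01):
    """One LAMB update for a single tensor, mirroring the deleted TF1 class."""
    # Adam-style moment estimates (the TF1 code applies no bias correction).
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad ** 2
    update = m / (np.sqrt(v) + eps)
    # Decoupled weight decay: added to the update, not to the loss.
    update = update + weight_decay * param
    # Layer-wise trust ratio ||w|| / ||update||, guarded against zero norms.
    w_norm = np.linalg.norm(param)
    u_norm = np.linalg.norm(update)
    ratio = w_norm / u_norm if (w_norm > 0 and u_norm > 0) else 1.0
    return param - ratio * lr * update, m, v


# Example: one step on a random 3x3 weight matrix.
rng = np.random.default_rng(0)
w, g = rng.normal(size=(3, 3)), rng.normal(size=(3, 3))
w, m, v = lamb_step(w, g, np.zeros_like(w), np.zeros_like(w), lr=1e-3)
```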
@@ -1,225 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Functions and classes related to optimization (weight updates)."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-import tensorflow as tf
-import lamb_optimizer
-from gpt_2_simple.src import memory_saving_gradients
-
-
-def create_optimizer(
-    loss,
-    init_lr,
-    num_train_steps,
-    num_warmup_steps,
-    use_tpu,
-    optimizer="lamb",
-    poly_power=1.0,
-    start_warmup_step=0,
-    use_memory_saving_gradients=False
-):
-    """Creates an optimizer training op."""
-    global_step = tf.train.get_or_create_global_step()
-
-    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
-
-    # Implements linear decay of the learning rate.
-    learning_rate = tf.train.polynomial_decay(
-        learning_rate,
-        global_step,
-        num_train_steps,
-        end_learning_rate=0.0,
-        power=poly_power,
-        cycle=False,
-    )
-
-    # Implements linear warmup. I.e., if global_step - start_warmup_step <
-    # num_warmup_steps, the learning rate will be
-    # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
-    if num_warmup_steps:
-        tf.logging.info(
-            "++++++ warmup starts at step "
-            + str(start_warmup_step)
-            + ", for "
-            + str(num_warmup_steps)
-            + " steps ++++++"
-        )
-        global_steps_int = tf.cast(global_step, tf.int32)
-        start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
-        global_steps_int = global_steps_int - start_warm_int
-        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
-
-        global_steps_float = tf.cast(global_steps_int, tf.float32)
-        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
-
-        warmup_percent_done = global_steps_float / warmup_steps_float
-        warmup_learning_rate = init_lr * warmup_percent_done
-
-        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
-        learning_rate = (
-            1.0 - is_warmup
-        ) * learning_rate + is_warmup * warmup_learning_rate
-
-    # It is OK that you use this optimizer for finetuning, since this
-    # is how the model was trained (note that the Adam m/v variables are NOT
-    # loaded from init_checkpoint.)
-    # It is OK to use AdamW in the finetuning even the model is trained by LAMB.
-    # As report in the Bert pulic github, the learning rate for SQuAD 1.1 finetune
-    # is 3e-5, 4e-5 or 5e-5. For LAMB, the users can use 3e-4, 4e-4,or 5e-4 for a
-    # batch size of 64 in the finetune.
-    if optimizer == "adamw":
-        tf.logging.info("using adamw")
-        optimizer = AdamWeightDecayOptimizer(
-            learning_rate=learning_rate,
-            weight_decay_rate=0.01,
-            beta_1=0.9,
-            beta_2=0.999,
-            epsilon=1e-6,
-            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
-        )
-    elif optimizer == "lamb":
-        tf.logging.info("using lamb")
-        optimizer = lamb_optimizer.LAMBOptimizer(
-            learning_rate=learning_rate,
-            weight_decay_rate=0.01,
-            beta_1=0.9,
-            beta_2=0.999,
-            epsilon=1e-6,
-            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
-        )
-    else:
-        raise ValueError("Not supported optimizer: ", optimizer)
-
-    if use_tpu:
-        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
-
-    tvars = tf.trainable_variables()
-
-
-    if use_memory_saving_gradients:
-        grads = memory_saving_gradients.gradients(loss, tvars)
-    else:
-        grads = tf.gradients(ys=loss, xs=tvars)
-
-
-    # This is how bert was pre-trained.
-    #(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
-
-    train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
-
-    # Normally the global step update is done inside of `apply_gradients`.
-    # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` do this.
-    # But if you use a different optimizer, you should probably take this line
-    # out.
-    new_global_step = global_step + 1
-    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
-    return train_op
-
-
-class AdamWeightDecayOptimizer(tf.train.Optimizer):
-    """A basic Adam optimizer that includes "correct" L2 weight decay."""
-
-    def __init__(
-        self,
-        learning_rate,
-        weight_decay_rate=0.0,
-        beta_1=0.9,
-        beta_2=0.999,
-        epsilon=1e-6,
-        exclude_from_weight_decay=None,
-        name="AdamWeightDecayOptimizer",
-    ):
-        """Constructs a AdamWeightDecayOptimizer."""
-        super(AdamWeightDecayOptimizer, self).__init__(False, name)
-
-        self.learning_rate = learning_rate
-        self.weight_decay_rate = weight_decay_rate
-        self.beta_1 = beta_1
-        self.beta_2 = beta_2
-        self.epsilon = epsilon
-        self.exclude_from_weight_decay = exclude_from_weight_decay
-
-    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-        """See base class."""
-        assignments = []
-        for (grad, param) in grads_and_vars:
-            if grad is None or param is None:
-                continue
-
-            param_name = self._get_variable_name(param.name)
-
-            m = tf.get_variable(
-                name=param_name + "/adam_m",
-                shape=param.shape.as_list(),
-                dtype=tf.float32,
-                trainable=False,
-                initializer=tf.zeros_initializer(),
-            )
-            v = tf.get_variable(
-                name=param_name + "/adam_v",
-                shape=param.shape.as_list(),
-                dtype=tf.float32,
-                trainable=False,
-                initializer=tf.zeros_initializer(),
-            )
-
-            # Standard Adam update.
-            next_m = tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)
-            next_v = tf.multiply(self.beta_2, v) + tf.multiply(
-                1.0 - self.beta_2, tf.square(grad)
-            )
-
-            update = next_m / (tf.sqrt(next_v) + self.epsilon)
-
-            # Just adding the square of the weights to the loss function is *not*
-            # the correct way of using L2 regularization/weight decay with Adam,
-            # since that will interact with the m and v parameters in strange ways.
-            #
-            # Instead we want ot decay the weights in a manner that doesn't interact
-            # with the m/v parameters. This is equivalent to adding the square
-            # of the weights to the loss with plain (non-momentum) SGD.
-            if self._do_use_weight_decay(param_name):
-                update += self.weight_decay_rate * param
-
-            update_with_lr = self.learning_rate * update
-
-            next_param = param - update_with_lr
-
-            assignments.extend(
-                [param.assign(next_param), m.assign(next_m), v.assign(next_v)]
-            )
-        return tf.group(*assignments, name=name)
-
-    def _do_use_weight_decay(self, param_name):
-        """Whether to use L2 weight decay for `param_name`."""
-        if not self.weight_decay_rate:
-            return False
-        if self.exclude_from_weight_decay:
-            for r in self.exclude_from_weight_decay:
-                if re.search(r, param_name) is not None:
-                    return False
-        return True
-
-    def _get_variable_name(self, param_name):
-        """Get the variable name from the tensor name."""
-        m = re.match("^(.*):\\d+$", param_name)
-        if m is not None:
-            param_name = m.group(1)
-        return param_name
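The schedule assembled at the top of the deleted `create_optimizer` is polynomial decay to zero over `num_train_steps` (linear when `poly_power=1.0`), overridden by linear warmup for the first `num_warmup_steps` steps after `start_warmup_step`. A plain-Python approximation of that schedule, for illustration only:

```python
# Sketch of the learning-rate schedule from create_optimizer above; not code
# from the package, and it ignores the TF graph mechanics.

def schedule(step, init_lr, num_train_steps, num_warmup_steps,
             poly_power=1.0, start_warmup_step=0):
    # Polynomial decay to 0.0 over num_train_steps (mirrors
    # tf.train.polynomial_decay with cycle=False and end_learning_rate=0.0).
    decayed = init_lr * (1.0 - min(step, num_train_steps) / num_train_steps) ** poly_power
    # Linear warmup overrides the decayed rate inside the warmup window.
    if num_warmup_steps and (step - start_warmup_step) < num_warmup_steps:
        return init_lr * max(step - start_warmup_step, 0) / num_warmup_steps
    return decayed


for s in (0, 500, 1000, 5000, 10000):
    print(s, schedule(s, init_lr=1e-4, num_train_steps=10000, num_warmup_steps=1000))
```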