SinaTools-0.1.40-py2.py3-none-any.whl → SinaTools-1.0.1-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/aragpt2/grover/optimization_adafactor.py
@@ -1,234 +0,0 @@
-# Original work Copyright 2018 The Google AI Language Team Authors.
-# Modified work Copyright 2019 Rowan Zellers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import re
-import tensorflow as tf
-from grover.utils import get_shape_list
-
-
-def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
-    """Creates an optimizer training op."""
-    global_step = tf.train.get_or_create_global_step()
-
-    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
-
-    # Implements linear decay of the learning rate.
-    learning_rate = tf.train.polynomial_decay(
-        learning_rate,
-        global_step,
-        num_train_steps,
-        end_learning_rate=0.0,
-        power=1.0,
-        cycle=False)
-
-    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
-    # learning rate will be `global_step/num_warmup_steps * init_lr`.
-    if num_warmup_steps:
-        global_steps_int = tf.cast(global_step, tf.int32)
-        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
-
-        global_steps_float = tf.cast(global_steps_int, tf.float32)
-        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
-
-        warmup_percent_done = global_steps_float / warmup_steps_float
-        warmup_learning_rate = init_lr * warmup_percent_done
-
-        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
-        learning_rate = (
-            (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
-
-    # It is recommended that you use this optimizer for fine tuning, since this
-    # is how the model was trained (note that the Adam m/v variables are NOT
-    # loaded from init_checkpoint.)
-    optimizer = AdaFactorOptimizer(
-        learning_rate=learning_rate,
-        weight_decay_rate=0.01,
-        beta_1=0.9,
-        beta_2=0.999,
-        epsilon=1e-6,
-        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
-
-    if use_tpu:
-        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
-
-    tvars = tf.trainable_variables()
-    grads = tf.gradients(loss, tvars)
-
-    # You could do this, but instead we don't because a) it's slow and b) we already did the 'update clipping'
-    # (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
-
-    train_op = optimizer.apply_gradients(
-        zip(grads, tvars), global_step=global_step)
-
-    # Normally the global step update is done inside of `apply_gradients`.
-    # However, `AdaFactorOptimizer` doesn't do this. But if you use
-    # a different optimizer, you should probably take this line out.
-    new_global_step = global_step + 1
-    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
-
-    train_metrics = {
-        'learning_rate': learning_rate,
-        'minibatch_loss': loss,
-        # 'minibatch_ppl': tf.math.exp(loss),
-    }
-    return train_op, train_metrics
-
-
-class AdaFactorOptimizer(tf.compat.v1.train.Optimizer):
-    """here's the optimizer we'll use"""
-
-    def __init__(self,
-                 learning_rate,
-                 weight_decay_rate=0.0,
-                 beta_1=0.9,
-                 beta_2=0.999,
-                 epsilon=1e-6,
-                 exclude_from_weight_decay=None,
-                 clipping_rate=1.0,
-                 name="AdaFactorOptimizer"):
-        """Constructs a AdaFactorOptimizer."""
-        super(AdaFactorOptimizer, self).__init__(False, name)
-
-        self.learning_rate = learning_rate
-        self.weight_decay_rate = weight_decay_rate
-        self.beta_1 = beta_1
-        self.beta_2 = beta_2
-        self.epsilon = epsilon
-        self.epsilon1 = 1e-30
-        self.epsilon2 = 0.001
-        self.clipping_rate = clipping_rate
-        self.exclude_from_weight_decay = exclude_from_weight_decay
-        self.use_locking = False
-
-    def _use_factored(self, shape):
-        return len(shape) >= 2
-
-    def _parameter_scale(self, var):
-        """Estimate the scale of the parameters from the current values.
-        We include a minimum value of 0.001 to give it a chance to escape 0
-        if it was zero-initialized.
-        Instead of using the value, we could impute the scale from the shape,
-        as initializers do.
-        Args:
-          var: a variable or Tensor.
-        Returns:
-          a Scalar
-        """
-        return tf.maximum(reduce_rms(var), self.epsilon2)
-
-    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-        """See base class."""
-        assignments = []
-        for (grad, param) in grads_and_vars:
-            if grad is None or param is None:
-                continue
-
-            param_name = self._get_variable_name(param.name)
-            shape_list = get_shape_list(param, expected_rank=[1, 2])
-
-            # decay_rate = 1 - tf.pow(tf.cast(tf.train.get_or_create_global_step(), tf.float32) + 1.0, -0.8)
-            decay_rate = self.beta_2
-            grad_squared = tf.square(grad) + self.epsilon1
-
-            update_scale = self.learning_rate
-            # update_scale = self.learning_rate * tf.cast(self._parameter_scale(param), dtype=tf.float32)
-
-            # HACK: Make things dependent on grad.
-            # This confounds the XLA rewriter and keeps it from fusing computations
-            # across different variables. This fusion is a bad for HBM usage, since
-            # it causes the gradients to persist in memory.
-            grad_squared_mean = tf.reduce_mean(grad_squared)
-            decay_rate += grad_squared_mean * 1e-30
-            update_scale += grad_squared_mean * 1e-30
-
-            # END HACK
-
-            if self._use_factored(shape_list):
-                num_rows, num_columns = shape_list
-
-                vr = tf.get_variable(
-                    name=param_name + "/adafactor_vr",
-                    shape=[num_rows],
-                    dtype=tf.float32,
-                    trainable=False,
-                    initializer=tf.zeros_initializer())
-                vc = tf.get_variable(
-                    name=param_name + "/adafactor_vc",
-                    shape=[num_columns],
-                    dtype=tf.float32,
-                    trainable=False,
-                    initializer=tf.zeros_initializer())
-
-                next_vr = decay_rate * vr + (1 - decay_rate) * tf.reduce_mean(grad_squared, 1)
-                next_vc = decay_rate * vc + (1 - decay_rate) * tf.reduce_mean(grad_squared, 0)
-
-                long_term_mean = tf.reduce_mean(next_vr, -1, keepdims=True)
-                r_factor = tf.rsqrt(next_vr / long_term_mean + self.epsilon1)
-                c_factor = tf.rsqrt(next_vc + self.epsilon1)
-                update = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2)
-
-                assignments.append(vr.assign(next_vr, use_locking=self.use_locking))
-                assignments.append(vc.assign(next_vc, use_locking=self.use_locking))
-            else:
-                v = tf.get_variable(
-                    name=param_name + "/adafactor_v",
-                    shape=shape_list,
-                    dtype=tf.float32,
-                    trainable=False,
-                    initializer=tf.zeros_initializer())
-                next_v = decay_rate * v + (1 - decay_rate) * grad_squared
-
-                assignments.append(v.assign(next_v, use_locking=self.use_locking))
-                update = grad * tf.rsqrt(next_v + self.epsilon1)
-
-            clipping_denom = tf.maximum(1.0, reduce_rms(update) / self.clipping_rate)
-            update /= clipping_denom
-
-            # Do weight decay
-            # Just adding the square of the weights to the loss function is *not*
-            # the correct way of using L2 regularization/weight decay with Adam,
-            # since that will interact with the m and v parameters in strange ways.
-            #
-            # Instead we want ot decay the weights in a manner that doesn't interact
-            # with the m/v parameters. This is equivalent to adding the square
-            # # of the weights to the loss with plain (non-momentum) SGD.
-            if self._do_use_weight_decay(param_name):
-                update += self.weight_decay_rate * param
-
-            update_with_lr = update_scale * update
-            next_param = param - update_with_lr
-
-            assignments.append(param.assign(next_param, use_locking=self.use_locking))
-        return tf.group(*assignments, name=name)
-
-    def _do_use_weight_decay(self, param_name):
-        """Whether to use L2 weight decay for `param_name`."""
-        if not self.weight_decay_rate:
-            return False
-        if self.exclude_from_weight_decay:
-            for r in self.exclude_from_weight_decay:
-                if re.search(r, param_name) is not None:
-                    return False
-        return True
-
-    def _get_variable_name(self, param_name):
-        """Get the variable name from the tensor name."""
-        m = re.match("^(.*):\\d+$", param_name)
-        if m is not None:
-            param_name = m.group(1)
-        return param_name
-
-
-def reduce_rms(x):
-    return tf.sqrt(tf.reduce_mean(tf.square(x)))
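The file above implements the AdaFactor-style optimizer used by the removed Grover/gpt2-ml training code: instead of a full Adam-style second-moment estimate, each 2-D parameter keeps one running average per row (`vr`) and one per column (`vc`), and the update is rescaled by their inverse square roots and then RMS-clipped. As a reading aid only, here is a minimal NumPy sketch of that factored step for a single 2-D gradient; the function name `adafactor_update` and its signature are made up for this illustration, and the weight-decay and TPU plumbing of the deleted file are omitted.

```python
import numpy as np


def adafactor_update(grad, vr, vc, learning_rate, decay_rate=0.999,
                     epsilon1=1e-30, clipping_rate=1.0):
    """One factored AdaFactor step for a 2-D parameter's gradient (sketch)."""
    grad_squared = np.square(grad) + epsilon1

    # Row/column running averages replace the full second-moment matrix.
    next_vr = decay_rate * vr + (1 - decay_rate) * grad_squared.mean(axis=1)
    next_vc = decay_rate * vc + (1 - decay_rate) * grad_squared.mean(axis=0)

    # Normalize rows by their mean, then take inverse square roots (r/c factors).
    long_term_mean = next_vr.mean(axis=-1, keepdims=True)
    r_factor = 1.0 / np.sqrt(next_vr / long_term_mean + epsilon1)
    c_factor = 1.0 / np.sqrt(next_vc + epsilon1)
    update = grad * r_factor[:, None] * c_factor[None, :]

    # "Update clipping" by the root-mean-square of the update, as in the code above.
    rms = np.sqrt(np.mean(np.square(update)))
    update /= max(1.0, rms / clipping_rate)
    return learning_rate * update, next_vr, next_vc


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    g = rng.normal(size=(4, 3)).astype(np.float32)
    step, vr, vc = adafactor_update(g, np.zeros(4), np.zeros(3), learning_rate=1e-2)
    print(step.shape, vr.shape, vc.shape)  # (4, 3) (4,) (3,)
```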
sinatools/arabert/aragpt2/grover/train_tpu.py
@@ -1,187 +0,0 @@
-# Original work Copyright 2018 The Google AI Language Team Authors.
-# Modified work Copyright 2019 Rowan Zellers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""" Training script! From https://github.com/imcaspar/gpt2-ml"""
-
-import tensorflow as tf
-import os
-
-from grover.dataloader import input_fn_builder
-from grover.modeling import model_fn_builder, GroverConfig
-
-flags = tf.flags
-
-FLAGS = flags.FLAGS
-
-## Required parameters
-flags.DEFINE_string(
-    "config_file", 'configs/base.json',
-    "The config json file corresponding to the pre-trained news model. "
-    "This specifies the model architecture.")
-
-flags.DEFINE_string(
-    "input_file", None,
-    "Input TF example files (can be a glob or comma separated).")
-
-flags.DEFINE_string(
-    "output_dir", None,
-    "The output directory where the model checkpoints will be written.")
-
-## Other parameters
-flags.DEFINE_string(
-    "init_checkpoint", None,
-    "Initial checkpoint (usually from a pre-trained model).")
-
-flags.DEFINE_integer(
-    "max_seq_length", 1024,
-    "The maximum total input sequence length after BPE tokenization. "
-    "Sequences longer than this will be truncated, and sequences shorter "
-    "than this will be padded. Must match data generation.")
-
-flags.DEFINE_bool("do_train", False, "Whether to run training.")
-
-flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
-
-flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
-
-flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for evaluation.")
-
-flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for adafactor.")
-
-flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.")
-
-flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
-
-flags.DEFINE_integer("save_checkpoints_steps", 1000,
-                     "How often to save the model checkpoint.")
-
-flags.DEFINE_integer("iterations_per_loop", 1000,
-                     "How many steps to make in each estimator call.")
-
-flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
-
-flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
-
-flags.DEFINE_string(
-    "tpu_name", None,
-    "The Cloud TPU to use for training. This should be either the name "
-    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-    "url.")
-
-flags.DEFINE_string(
-    "tpu_zone", None,
-    "[Optional] GCE zone where the Cloud TPU is located in. If not "
-    "specified, we will attempt to automatically detect the GCE project from "
-    "metadata.")
-
-flags.DEFINE_string(
-    "gcp_project", None,
-    "[Optional] Project name for the Cloud TPU-enabled project. If not "
-    "specified, we will attempt to automatically detect the GCE project from "
-    "metadata.")
-
-flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
-
-flags.DEFINE_integer(
-    "num_tpu_cores", 8,
-    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
-
-
-def main(_):
-    tf.logging.set_verbosity(tf.logging.INFO)
-    logger = tf.get_logger()
-    logger.propagate = False
-
-    news_config = GroverConfig.from_json_file(FLAGS.config_file)
-
-    tf.gfile.MakeDirs(FLAGS.output_dir)
-
-    input_files = []
-    for input_pattern in FLAGS.input_file.split(","):
-        input_files.extend(tf.gfile.Glob(input_pattern))
-
-    # tf.logging.info("*** Input Files ***")
-    # for input_file in input_files:
-    #     tf.logging.info(" %s" % input_file)
-
-    tpu_cluster_resolver = None
-    if FLAGS.use_tpu and FLAGS.tpu_name:
-        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
-            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
-
-    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-    run_config = tf.contrib.tpu.RunConfig(
-        cluster=tpu_cluster_resolver,
-        master=FLAGS.master,
-        model_dir=FLAGS.output_dir,
-        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
-        keep_checkpoint_max=None,
-        tpu_config=tf.contrib.tpu.TPUConfig(
-            iterations_per_loop=FLAGS.iterations_per_loop,
-            num_shards=FLAGS.num_tpu_cores,
-            per_host_input_for_training=is_per_host))
-
-    model_fn = model_fn_builder(news_config, init_checkpoint=FLAGS.init_checkpoint,
-                                learning_rate=FLAGS.learning_rate,
-                                num_train_steps=FLAGS.num_train_steps,
-                                num_warmup_steps=FLAGS.num_warmup_steps,
-                                use_tpu=FLAGS.use_tpu,
-                                num_tpu_cores=FLAGS.num_tpu_cores,
-                                eval_batch_size=FLAGS.eval_batch_size
-                                )
-
-    # If TPU is not available, this will fall back to normal Estimator on CPU
-    # or GPU.
-    estimator = tf.contrib.tpu.TPUEstimator(
-        use_tpu=FLAGS.use_tpu,
-        model_fn=model_fn,
-        config=run_config,
-        train_batch_size=FLAGS.train_batch_size,
-        eval_batch_size=FLAGS.eval_batch_size,
-        params={'model_dir': FLAGS.output_dir}
-    )
-    if FLAGS.do_train:
-        tf.logging.info("***** Running training *****")
-        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
-        train_input_fn = input_fn_builder(
-            input_files=input_files,
-            seq_length=FLAGS.max_seq_length,
-            is_training=True)
-
-        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
-
-    if FLAGS.do_eval:
-        tf.logging.info("***** Running evaluation *****")
-        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
-
-        eval_input_fn = input_fn_builder(
-            input_files=input_files,
-            seq_length=FLAGS.max_seq_length,
-            is_training=False,
-        )
-
-        result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
-
-        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
-        with tf.gfile.GFile(output_eval_file, "w") as writer:
-            tf.logging.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                tf.logging.info(" %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    flags.mark_flag_as_required("input_file")
-    flags.mark_flag_as_required("output_dir")
-    tf.app.run()
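This training script feeds its defaults (`learning_rate=5e-5`, `num_train_steps=100000`, `num_warmup_steps=10000`) into `create_optimizer` from the optimization file above, which applies linear warmup followed by linear (`power=1.0`) decay to zero. The following pure-Python sketch only illustrates how those defaults combine into a schedule; the helper name `grover_lr` is hypothetical and not part of the removed code.

```python
def grover_lr(step, init_lr=5e-5, num_train_steps=100_000, num_warmup_steps=10_000):
    """Approximate learning rate at a given step under the removed schedule."""
    decayed = init_lr * max(0.0, 1.0 - step / num_train_steps)  # polynomial_decay, power=1.0
    if step < num_warmup_steps:
        return init_lr * step / num_warmup_steps  # linear warmup
    return decayed


for s in (0, 5_000, 10_000, 50_000, 100_000):
    print(s, grover_lr(s))
# 0 -> 0.0, 5000 -> 2.5e-05, 10000 -> 4.5e-05, 50000 -> 2.5e-05, 100000 -> 0.0
```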
sinatools/arabert/aragpt2/grover/utils.py
@@ -1,234 +0,0 @@
-# Original work Copyright 2018 The Google AI Language Team Authors.
-# Modified work Copyright 2019 Rowan Zellers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import re
-
-import six
-import tensorflow as tf
-import numpy as np
-from tensorflow.python.lib.io import file_io
-
-
-def _save_np(absolute_fn, array):
-    if absolute_fn.startswith('gs://'):
-        with file_io.FileIO(absolute_fn, 'w') as f:
-            np.save(f, array)
-    else:
-        np.save(absolute_fn, array)
-
-
-def assert_rank(tensor, expected_rank, name=None):
-    """Raises an exception if the tensor rank is not of the expected rank.
-
-    Args:
-      tensor: A tf.Tensor to check the rank of.
-      expected_rank: Python integer or list of integers, expected rank.
-      name: Optional name of the tensor for the error message.
-
-    Raises:
-      ValueError: If the expected shape doesn't match the actual shape.
-    """
-    if name is None:
-        name = tensor.name
-
-    expected_rank_dict = {}
-    if isinstance(expected_rank, six.integer_types):
-        expected_rank_dict[expected_rank] = True
-    else:
-        for x in expected_rank:
-            expected_rank_dict[x] = True
-
-    actual_rank = tensor.shape.ndims
-    if actual_rank not in expected_rank_dict:
-        scope_name = tf.get_variable_scope().name
-        raise ValueError(
-            "For the tensor `%s` in scope `%s`, the actual rank "
-            "`%d` (shape = %s) is not equal to the expected rank `%s`" %
-            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
-
-
-def get_shape_list(tensor, expected_rank=None, name=None):
-    """Returns a list of the shape of tensor, preferring static dimensions.
-
-    Args:
-      tensor: A tf.Tensor object to find the shape of.
-      expected_rank: (optional) int. The expected rank of `tensor`. If this is
-        specified and the `tensor` has a different rank, and exception will be
-        thrown.
-      name: Optional name of the tensor for the error message.
-
-    Returns:
-      A list of dimensions of the shape of tensor. All static dimensions will
-      be returned as python integers, and dynamic dimensions will be returned
-      as tf.Tensor scalars.
-    """
-    if name is None:
-        name = tensor.name
-
-    if expected_rank is not None:
-        assert_rank(tensor, expected_rank, name)
-
-    shape = tensor.shape.as_list()
-
-    non_static_indexes = []
-    for (index, dim) in enumerate(shape):
-        if dim is None:
-            non_static_indexes.append(index)
-
-    if not non_static_indexes:
-        return shape
-
-    dyn_shape = tf.shape(tensor)
-    for index in non_static_indexes:
-        shape[index] = dyn_shape[index]
-    return shape
-
-
-def gelu(input_tensor):
-    """Gaussian Error Linear Unit.
-
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-
-    Args:
-      input_tensor: float Tensor to perform activation.
-
-    Returns:
-      `input_tensor` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
-    return input_tensor * cdf
-
-
-def layer_norm(input_tensor, name=None, epsilon=1e-5):
-    """Run layer normalization on the last dimension of the tensor."""
-    name2use = f'LayerNorm_{name}' if name is not None else name
-    with tf.variable_scope(name2use, default_name='LayerNorm'):
-        dim = input_tensor.shape[-1].value
-        gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1))
-        beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0))
-        mean = tf.reduce_mean(input_tensor, axis=-1, keepdims=True)
-        std = tf.reduce_mean(tf.square(input_tensor - mean), axis=-1, keepdims=True)
-        input_tensor = (input_tensor - mean) * tf.rsqrt(std + epsilon)
-        input_tensor = input_tensor * gamma + beta
-    return input_tensor
-
-
-def dropout(input_tensor, dropout_prob):
-    """Perform dropout.
-
-    Args:
-      input_tensor: float Tensor.
-      dropout_prob: Python float. The probability of dropping out a value (NOT of
-        *keeping* a dimension as in `tf.nn.dropout`).
-
-    Returns:
-      A version of `input_tensor` with dropout applied.
-    """
-    if dropout_prob is None or dropout_prob == 0.0:
-        return input_tensor
-    output = tf.nn.dropout(input_tensor, rate=dropout_prob)
-    return output
-
-
-def get_attention_mask(nd, ns, *, dtype):
-    """
-    this is a TPU compatible version of tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd)
-    where the lower right triangle contains 1s
-    """
-    i = tf.range(nd)[:, None]
-    j = tf.range(ns)
-    m = i >= j - ns + nd
-    return tf.cast(m, dtype)
-
-
-def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
-    """Compute the union of the current variables and checkpoint variables."""
-    assignment_map = {}
-    initialized_variable_names = {}
-
-    name_to_variable = collections.OrderedDict()
-    for var in tvars:
-        name = var.name
-        m = re.match("^(.*):\\d+$", name)
-        if m is not None:
-            name = m.group(1)
-        name_to_variable[name] = var
-
-    init_vars = tf.train.list_variables(init_checkpoint)
-
-    assignment_map = collections.OrderedDict()
-    for x in init_vars:
-        (name, var) = (x[0], x[1])
-        if name not in name_to_variable:
-            continue
-        assignment_map[name] = name
-        initialized_variable_names[name] = 1
-        initialized_variable_names[name + ":0"] = 1
-    return (assignment_map, initialized_variable_names)
-
-
-def construct_scalar_host_call(metric_dict, model_dir, prefix=""):
-    """Construct a host call to log scalars when training on TPU.
-
-    Args:
-      metric_dict: A dict of the tensors to be logged.
-      model_dir: The location to write the summary.
-      prefix: The prefix (if any) to prepend to the metric names.
-
-    Returns:
-      A tuple of (function, args_to_be_passed_to_said_function)
-    """
-    metric_names = list(metric_dict.keys())
-
-    def host_call_fn(global_step, *args):
-        """Training host call. Creates scalar summaries for training metrics.
-
-        This function is executed on the CPU and should not directly reference
-        any Tensors in the rest of the `model_fn`. To pass Tensors from the
-        model to the `metric_fn`, provide as part of the `host_call`. See
-        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
-        for more information.
-
-        Arguments should match the list of `Tensor` objects passed as the second
-        element in the tuple passed to `host_call`.
-
-        Args:
-          global_step: `Tensor with shape `[batch]` for the global_step
-          *args: Remaining tensors to log.
-
-        Returns:
-          List of summary ops to run on the CPU host.
-        """
-        step = global_step[0]
-        with tf.contrib.summary.create_file_writer(
-                logdir=model_dir, filename_suffix=".host_call").as_default():
-            with tf.contrib.summary.always_record_summaries():
-                for i, name in enumerate(metric_names):
-                    tf.contrib.summary.scalar(prefix + name, args[i][0], step=step)
-
-                return tf.contrib.summary.all_summary_ops()
-
-    # To log the current learning rate, and gradient norm for Tensorboard, the
-    # summary op needs to be run on the host CPU via host_call. host_call
-    # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
-    # dimension. These Tensors are implicitly concatenated to
-    # [params['batch_size']].
-    global_step_tensor = tf.reshape(
-        tf.compat.v1.train.get_or_create_global_step(), [1])
-    other_tensors = [tf.reshape(metric_dict[key], [1]) for key in metric_names]
-
-    return host_call_fn, [global_step_tensor] + other_tensors
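Among the removed utilities, `get_attention_mask` builds the causal mask in a TPU-compatible way; as its docstring notes, it reproduces `tf.matrix_band_part(tf.ones([nd, ns]), -1, ns - nd)`, so destination position `i` may attend to source position `j` only when `j <= i + (ns - nd)`. A rough NumPy equivalent, for illustration only and not part of the package:

```python
import numpy as np


def causal_attention_mask(nd, ns, dtype=np.float32):
    """NumPy sketch of the removed get_attention_mask: lower-right triangle is 1."""
    i = np.arange(nd)[:, None]
    j = np.arange(ns)
    return (i >= j - ns + nd).astype(dtype)


print(causal_attention_mask(3, 5))
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 1.]]
```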