SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
- SinaTools-1.0.1.dist-info/RECORD +73 -0
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +5 -7
- sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- sinatools/ner/trainers/BertTrainer.py +163 -163
- sinatools/ner/trainers/__init__.py +2 -2
- SinaTools-0.1.40.dist-info/RECORD +0 -123
- sinatools/arabert/arabert/__init__.py +0 -14
- sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools/arabert/arabert/optimization.py +0 -202
- sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
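
The summary above shows the release bump (`sinatools/VERSION +1 -1`) and the removal of the bundled arabert/araelectra/aragpt2 training scripts. As a quick, optional sanity check after upgrading, the installed distribution version can be compared against the new release; this is a minimal sketch (not part of SinaTools itself), assuming the 1.0.1 wheel has been installed into the current environment and using the standard-library `importlib.metadata` API, with the distribution name `SinaTools` taken from the wheel filename above:

```python
# Minimal sketch: report which of the two releases shown in this diff is installed.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("SinaTools")  # distribution name from the wheel filename
    print(f"Installed SinaTools version: {installed}")  # expected: 1.0.1 after upgrading
except PackageNotFoundError:
    print("SinaTools is not installed in this environment")
```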
--- sinatools/arabert/araelectra/run_pretraining.py
+++ /dev/null
@@ -1,469 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Pre-trains an ELECTRA model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import collections
-import json
-
-import tensorflow as tf
-
-import configure_pretraining
-from model import modeling
-from model import optimization
-from pretrain import pretrain_data
-from pretrain import pretrain_helpers
-from util import training_utils
-from util import utils
-
-
-class PretrainingModel(object):
-  """Transformer pre-training using the replaced-token-detection task."""
-
-  def __init__(self, config: configure_pretraining.PretrainingConfig,
-               features, is_training):
-    # Set up model config
-    self._config = config
-    self._bert_config = training_utils.get_bert_config(config)
-    if config.debug:
-      self._bert_config.num_hidden_layers = 3
-      self._bert_config.hidden_size = 144
-      self._bert_config.intermediate_size = 144 * 4
-      self._bert_config.num_attention_heads = 4
-
-    # Mask the input
-    unmasked_inputs = pretrain_data.features_to_inputs(features)
-    masked_inputs = pretrain_helpers.mask(
-        config, unmasked_inputs, config.mask_prob)
-
-    # Generator
-    embedding_size = (
-        self._bert_config.hidden_size if config.embedding_size is None else
-        config.embedding_size)
-    cloze_output = None
-    if config.uniform_generator:
-      # simple generator sampling fakes uniformly at random
-      mlm_output = self._get_masked_lm_output(masked_inputs, None)
-    elif ((config.electra_objective or config.electric_objective)
-          and config.untied_generator):
-      generator_config = get_generator_config(config, self._bert_config)
-      if config.two_tower_generator:
-        # two-tower cloze model generator used for electric
-        generator = TwoTowerClozeTransformer(
-            config, generator_config, unmasked_inputs, is_training,
-            embedding_size)
-        cloze_output = self._get_cloze_outputs(unmasked_inputs, generator)
-        mlm_output = get_softmax_output(
-            pretrain_helpers.gather_positions(
-                cloze_output.logits, masked_inputs.masked_lm_positions),
-            masked_inputs.masked_lm_ids, masked_inputs.masked_lm_weights,
-            self._bert_config.vocab_size)
-      else:
-        # small masked language model generator
-        generator = build_transformer(
-            config, masked_inputs, is_training, generator_config,
-            embedding_size=(None if config.untied_generator_embeddings
-                            else embedding_size),
-            untied_embeddings=config.untied_generator_embeddings,
-            scope="generator")
-        mlm_output = self._get_masked_lm_output(masked_inputs, generator)
-    else:
-      # full-sized masked language model generator if using BERT objective or if
-      # the generator and discriminator have tied weights
-      generator = build_transformer(
-          config, masked_inputs, is_training, self._bert_config,
-          embedding_size=embedding_size)
-      mlm_output = self._get_masked_lm_output(masked_inputs, generator)
-    fake_data = self._get_fake_data(masked_inputs, mlm_output.logits)
-    self.mlm_output = mlm_output
-    self.total_loss = config.gen_weight * (
-        cloze_output.loss if config.two_tower_generator else mlm_output.loss)
-
-    # Discriminator
-    disc_output = None
-    if config.electra_objective or config.electric_objective:
-      discriminator = build_transformer(
-          config, fake_data.inputs, is_training, self._bert_config,
-          reuse=not config.untied_generator, embedding_size=embedding_size)
-      disc_output = self._get_discriminator_output(
-          fake_data.inputs, discriminator, fake_data.is_fake_tokens,
-          cloze_output)
-      self.total_loss += config.disc_weight * disc_output.loss
-
-    # Evaluation
-    eval_fn_inputs = {
-        "input_ids": masked_inputs.input_ids,
-        "masked_lm_preds": mlm_output.preds,
-        "mlm_loss": mlm_output.per_example_loss,
-        "masked_lm_ids": masked_inputs.masked_lm_ids,
-        "masked_lm_weights": masked_inputs.masked_lm_weights,
-        "input_mask": masked_inputs.input_mask
-    }
-    if config.electra_objective or config.electric_objective:
-      eval_fn_inputs.update({
-          "disc_loss": disc_output.per_example_loss,
-          "disc_labels": disc_output.labels,
-          "disc_probs": disc_output.probs,
-          "disc_preds": disc_output.preds,
-          "sampled_tokids": tf.argmax(fake_data.sampled_tokens, -1,
-                                      output_type=tf.int32)
-      })
-    eval_fn_keys = eval_fn_inputs.keys()
-    eval_fn_values = [eval_fn_inputs[k] for k in eval_fn_keys]
-
-    def metric_fn(*args):
-      """Computes the loss and accuracy of the model."""
-      d = {k: arg for k, arg in zip(eval_fn_keys, args)}
-      metrics = dict()
-      metrics["masked_lm_accuracy"] = tf.metrics.accuracy(
-          labels=tf.reshape(d["masked_lm_ids"], [-1]),
-          predictions=tf.reshape(d["masked_lm_preds"], [-1]),
-          weights=tf.reshape(d["masked_lm_weights"], [-1]))
-      metrics["masked_lm_loss"] = tf.metrics.mean(
-          values=tf.reshape(d["mlm_loss"], [-1]),
-          weights=tf.reshape(d["masked_lm_weights"], [-1]))
-      if config.electra_objective or config.electric_objective:
-        metrics["sampled_masked_lm_accuracy"] = tf.metrics.accuracy(
-            labels=tf.reshape(d["masked_lm_ids"], [-1]),
-            predictions=tf.reshape(d["sampled_tokids"], [-1]),
-            weights=tf.reshape(d["masked_lm_weights"], [-1]))
-        if config.disc_weight > 0:
-          metrics["disc_loss"] = tf.metrics.mean(d["disc_loss"])
-          metrics["disc_auc"] = tf.metrics.auc(
-              d["disc_labels"] * d["input_mask"],
-              d["disc_probs"] * tf.cast(d["input_mask"], tf.float32))
-          metrics["disc_accuracy"] = tf.metrics.accuracy(
-              labels=d["disc_labels"], predictions=d["disc_preds"],
-              weights=d["input_mask"])
-          metrics["disc_precision"] = tf.metrics.accuracy(
-              labels=d["disc_labels"], predictions=d["disc_preds"],
-              weights=d["disc_preds"] * d["input_mask"])
-          metrics["disc_recall"] = tf.metrics.accuracy(
-              labels=d["disc_labels"], predictions=d["disc_preds"],
-              weights=d["disc_labels"] * d["input_mask"])
-      return metrics
-    self.eval_metrics = (metric_fn, eval_fn_values)
-
-  def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
-    """Masked language modeling softmax layer."""
-    with tf.variable_scope("generator_predictions"):
-      if self._config.uniform_generator:
-        logits = tf.zeros(self._bert_config.vocab_size)
-        logits_tiled = tf.zeros(
-            modeling.get_shape_list(inputs.masked_lm_ids) +
-            [self._bert_config.vocab_size])
-        logits_tiled += tf.reshape(logits, [1, 1, self._bert_config.vocab_size])
-        logits = logits_tiled
-      else:
-        relevant_reprs = pretrain_helpers.gather_positions(
-            model.get_sequence_output(), inputs.masked_lm_positions)
-        logits = get_token_logits(
-            relevant_reprs, model.get_embedding_table(), self._bert_config)
-      return get_softmax_output(
-          logits, inputs.masked_lm_ids, inputs.masked_lm_weights,
-          self._bert_config.vocab_size)
-
-  def _get_discriminator_output(
-      self, inputs, discriminator, labels, cloze_output=None):
-    """Discriminator binary classifier."""
-    with tf.variable_scope("discriminator_predictions"):
-      hidden = tf.layers.dense(
-          discriminator.get_sequence_output(),
-          units=self._bert_config.hidden_size,
-          activation=modeling.get_activation(self._bert_config.hidden_act),
-          kernel_initializer=modeling.create_initializer(
-              self._bert_config.initializer_range))
-      logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
-      if self._config.electric_objective:
-        log_q = tf.reduce_sum(
-            tf.nn.log_softmax(cloze_output.logits) * tf.one_hot(
-                inputs.input_ids, depth=self._bert_config.vocab_size,
-                dtype=tf.float32), -1)
-        log_q = tf.stop_gradient(log_q)
-        logits += log_q
-        logits += tf.log(self._config.mask_prob / (1 - self._config.mask_prob))
-
-      weights = tf.cast(inputs.input_mask, tf.float32)
-      labelsf = tf.cast(labels, tf.float32)
-      losses = tf.nn.sigmoid_cross_entropy_with_logits(
-          logits=logits, labels=labelsf) * weights
-      per_example_loss = (tf.reduce_sum(losses, axis=-1) /
-                          (1e-6 + tf.reduce_sum(weights, axis=-1)))
-      loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
-      probs = tf.nn.sigmoid(logits)
-      preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
-      DiscOutput = collections.namedtuple(
-          "DiscOutput", ["loss", "per_example_loss", "probs", "preds",
-                         "labels"])
-      return DiscOutput(
-          loss=loss, per_example_loss=per_example_loss, probs=probs,
-          preds=preds, labels=labels,
-      )
-
-  def _get_fake_data(self, inputs, mlm_logits):
-    """Sample from the generator to create corrupted input."""
-    inputs = pretrain_helpers.unmask(inputs)
-    disallow = tf.one_hot(
-        inputs.masked_lm_ids, depth=self._bert_config.vocab_size,
-        dtype=tf.float32) if self._config.disallow_correct else None
-    sampled_tokens = tf.stop_gradient(pretrain_helpers.sample_from_softmax(
-        mlm_logits / self._config.temperature, disallow=disallow))
-    sampled_tokids = tf.argmax(sampled_tokens, -1, output_type=tf.int32)
-    updated_input_ids, masked = pretrain_helpers.scatter_update(
-        inputs.input_ids, sampled_tokids, inputs.masked_lm_positions)
-    if self._config.electric_objective:
-      labels = masked
-    else:
-      labels = masked * (1 - tf.cast(
-          tf.equal(updated_input_ids, inputs.input_ids), tf.int32))
-    updated_inputs = pretrain_data.get_updated_inputs(
-        inputs, input_ids=updated_input_ids)
-    FakedData = collections.namedtuple("FakedData", [
-        "inputs", "is_fake_tokens", "sampled_tokens"])
-    return FakedData(inputs=updated_inputs, is_fake_tokens=labels,
-                     sampled_tokens=sampled_tokens)
-
-  def _get_cloze_outputs(self, inputs: pretrain_data.Inputs, model):
-    """Cloze model softmax layer."""
-    weights = tf.cast(pretrain_helpers.get_candidates_mask(
-        self._config, inputs), tf.float32)
-    with tf.variable_scope("cloze_predictions"):
-      logits = get_token_logits(model.get_sequence_output(),
-                                model.get_embedding_table(), self._bert_config)
-      return get_softmax_output(logits, inputs.input_ids, weights,
-                                self._bert_config.vocab_size)
-
-
-def get_token_logits(input_reprs, embedding_table, bert_config):
-  hidden = tf.layers.dense(
-      input_reprs,
-      units=modeling.get_shape_list(embedding_table)[-1],
-      activation=modeling.get_activation(bert_config.hidden_act),
-      kernel_initializer=modeling.create_initializer(
-          bert_config.initializer_range))
-  hidden = modeling.layer_norm(hidden)
-  output_bias = tf.get_variable(
-      "output_bias",
-      shape=[bert_config.vocab_size],
-      initializer=tf.zeros_initializer())
-  logits = tf.matmul(hidden, embedding_table, transpose_b=True)
-  logits = tf.nn.bias_add(logits, output_bias)
-  return logits
-
-
-def get_softmax_output(logits, targets, weights, vocab_size):
-  oh_labels = tf.one_hot(targets, depth=vocab_size, dtype=tf.float32)
-  preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
-  probs = tf.nn.softmax(logits)
-  log_probs = tf.nn.log_softmax(logits)
-  label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)
-  numerator = tf.reduce_sum(weights * label_log_probs)
-  denominator = tf.reduce_sum(weights) + 1e-6
-  loss = numerator / denominator
-  SoftmaxOutput = collections.namedtuple(
-      "SoftmaxOutput", ["logits", "probs", "loss", "per_example_loss", "preds",
-                        "weights"])
-  return SoftmaxOutput(
-      logits=logits, probs=probs, per_example_loss=label_log_probs,
-      loss=loss, preds=preds, weights=weights)
-
-
-class TwoTowerClozeTransformer(object):
-  """Build a two-tower Transformer used as Electric's generator."""
-
-  def __init__(self, config, bert_config, inputs: pretrain_data.Inputs,
-               is_training, embedding_size):
-    ltr = build_transformer(
-        config, inputs, is_training, bert_config,
-        untied_embeddings=config.untied_generator_embeddings,
-        embedding_size=(None if config.untied_generator_embeddings
-                        else embedding_size),
-        scope="generator_ltr", ltr=True)
-    rtl = build_transformer(
-        config, inputs, is_training, bert_config,
-        untied_embeddings=config.untied_generator_embeddings,
-        embedding_size=(None if config.untied_generator_embeddings
-                        else embedding_size),
-        scope="generator_rtl", rtl=True)
-    ltr_reprs = ltr.get_sequence_output()
-    rtl_reprs = rtl.get_sequence_output()
-    self._sequence_output = tf.concat([roll(ltr_reprs, -1),
-                                       roll(rtl_reprs, 1)], -1)
-    self._embedding_table = ltr.embedding_table
-
-  def get_sequence_output(self):
-    return self._sequence_output
-
-  def get_embedding_table(self):
-    return self._embedding_table
-
-
-def build_transformer(config: configure_pretraining.PretrainingConfig,
-                      inputs: pretrain_data.Inputs, is_training,
-                      bert_config, reuse=False, **kwargs):
-  """Build a transformer encoder network."""
-  with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
-    return modeling.BertModel(
-        bert_config=bert_config,
-        is_training=is_training,
-        input_ids=inputs.input_ids,
-        input_mask=inputs.input_mask,
-        token_type_ids=inputs.segment_ids,
-        use_one_hot_embeddings=config.use_tpu,
-        **kwargs)
-
-
-def roll(arr, direction):
-  """Shifts embeddings in a [batch, seq_len, dim] tensor to the right/left."""
-  return tf.concat([arr[:, direction:, :], arr[:, :direction, :]], axis=1)
-
-
-def get_generator_config(config: configure_pretraining.PretrainingConfig,
-                         bert_config: modeling.BertConfig):
-  """Get model config for the generator network."""
-  gen_config = modeling.BertConfig.from_dict(bert_config.to_dict())
-  gen_config.hidden_size = int(round(
-      bert_config.hidden_size * config.generator_hidden_size))
-  gen_config.num_hidden_layers = int(round(
-      bert_config.num_hidden_layers * config.generator_layers))
-  gen_config.intermediate_size = 4 * gen_config.hidden_size
-  gen_config.num_attention_heads = max(1, gen_config.hidden_size // 64)
-  return gen_config
-
-
-def model_fn_builder(config: configure_pretraining.PretrainingConfig):
-  """Build the model for training."""
-
-  def model_fn(features, labels, mode, params):
-    """Build the model for training."""
-    model = PretrainingModel(config, features,
-                             mode == tf.estimator.ModeKeys.TRAIN)
-    utils.log("Model is built!")
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      train_op = optimization.create_optimizer(
-          model.total_loss, config.learning_rate, config.num_train_steps,
-          weight_decay_rate=config.weight_decay_rate,
-          use_tpu=config.use_tpu,
-          warmup_steps=config.num_warmup_steps,
-          lr_decay_power=config.lr_decay_power
-      )
-      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
-          mode=mode,
-          loss=model.total_loss,
-          train_op=train_op,
-          training_hooks=[training_utils.ETAHook(
-              {} if config.use_tpu else dict(loss=model.total_loss),
-              config.num_train_steps, config.iterations_per_loop,
-              config.use_tpu)]
-      )
-    elif mode == tf.estimator.ModeKeys.EVAL:
-      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
-          mode=mode,
-          loss=model.total_loss,
-          eval_metrics=model.eval_metrics,
-          evaluation_hooks=[training_utils.ETAHook(
-              {} if config.use_tpu else dict(loss=model.total_loss),
-              config.num_eval_steps, config.iterations_per_loop,
-              config.use_tpu, is_training=False)])
-    else:
-      raise ValueError("Only TRAIN and EVAL modes are supported")
-    return output_spec
-
-  return model_fn
-
-
-def train_or_eval(config: configure_pretraining.PretrainingConfig):
-  """Run pre-training or evaluate the pre-trained model."""
-  if config.do_train == config.do_eval:
-    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
-  if config.debug and config.do_train:
-    utils.rmkdir(config.model_dir)
-  utils.heading("Config:")
-  utils.log_config(config)
-
-  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
-  tpu_cluster_resolver = None
-  if config.use_tpu and config.tpu_name:
-    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
-        config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
-  tpu_config = tf.estimator.tpu.TPUConfig(
-      iterations_per_loop=config.iterations_per_loop,
-      num_shards=config.num_tpu_cores,
-      tpu_job_name=config.tpu_job_name,
-      per_host_input_for_training=is_per_host)
-  run_config = tf.estimator.tpu.RunConfig(
-      cluster=tpu_cluster_resolver,
-      model_dir=config.model_dir,
-      save_checkpoints_steps=config.save_checkpoints_steps,
-      keep_checkpoint_max=config.keep_checkpoint_max,
-      tpu_config=tpu_config)
-  model_fn = model_fn_builder(config=config)
-  estimator = tf.estimator.tpu.TPUEstimator(
-      use_tpu=config.use_tpu,
-      model_fn=model_fn,
-      config=run_config,
-      train_batch_size=config.train_batch_size,
-      eval_batch_size=config.eval_batch_size)
-
-  if config.do_train:
-    utils.heading("Running training")
-    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
-                    max_steps=config.num_train_steps)
-  if config.do_eval:
-    utils.heading("Running evaluation")
-    result = estimator.evaluate(
-        input_fn=pretrain_data.get_input_fn(config, False),
-        steps=config.num_eval_steps)
-    for key in sorted(result.keys()):
-      utils.log(" {:} = {:}".format(key, str(result[key])))
-    return result
-
-
-def train_one_step(config: configure_pretraining.PretrainingConfig):
-  """Builds an ELECTRA model an trains it for one step; useful for debugging."""
-  train_input_fn = pretrain_data.get_input_fn(config, True)
-  features = tf.data.make_one_shot_iterator(train_input_fn(dict(
-      batch_size=config.train_batch_size))).get_next()
-  model = PretrainingModel(config, features, True)
-  with tf.Session() as sess:
-    sess.run(tf.global_variables_initializer())
-    utils.log(sess.run(model.total_loss))
-
-
-def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument("--data-dir", required=True,
-                      help="Location of data files (model weights, etc).")
-  parser.add_argument("--model-name", required=True,
-                      help="The name of the model being fine-tuned.")
-  parser.add_argument("--hparams", default="{}",
-                      help="JSON dict of model hyperparameters.")
-  args = parser.parse_args()
-  if args.hparams.endswith(".json"):
-    hparams = utils.load_json(args.hparams)
-  else:
-    hparams = json.loads(args.hparams)
-  tf.logging.set_verbosity(tf.logging.ERROR)
-  train_or_eval(configure_pretraining.PretrainingConfig(
-      args.model_name, args.data_dir, **hparams))
-
-
-if __name__ == "__main__":
-  main()
--- sinatools/arabert/araelectra/util/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- sinatools/arabert/araelectra/util/training_utils.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utilities for training the models."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import datetime
-import re
-import time
-import tensorflow as tf
-
-from model import modeling
-from util import utils
-
-
-class ETAHook(tf.estimator.SessionRunHook):
-  """Print out the time remaining during training/evaluation."""
-
-  def __init__(self, to_log, n_steps, iterations_per_loop, on_tpu,
-               log_every=1, is_training=True):
-    self._to_log = to_log
-    self._n_steps = n_steps
-    self._iterations_per_loop = iterations_per_loop
-    self._on_tpu = on_tpu
-    self._log_every = log_every
-    self._is_training = is_training
-    self._steps_run_so_far = 0
-    self._global_step = None
-    self._global_step_tensor = None
-    self._start_step = None
-    self._start_time = None
-
-  def begin(self):
-    self._global_step_tensor = tf.train.get_or_create_global_step()
-
-  def before_run(self, run_context):
-    if self._start_time is None:
-      self._start_time = time.time()
-    return tf.estimator.SessionRunArgs(self._to_log)
-
-  def after_run(self, run_context, run_values):
-    self._global_step = run_context.session.run(self._global_step_tensor)
-    self._steps_run_so_far += self._iterations_per_loop if self._on_tpu else 1
-    if self._start_step is None:
-      self._start_step = self._global_step - (self._iterations_per_loop
-                                              if self._on_tpu else 1)
-    self.log(run_values)
-
-  def end(self, session):
-    self._global_step = session.run(self._global_step_tensor)
-    self.log()
-
-  def log(self, run_values=None):
-    step = self._global_step if self._is_training else self._steps_run_so_far
-    if step % self._log_every != 0:
-      return
-    msg = "{:}/{:} = {:.1f}%".format(step, self._n_steps,
-                                     100.0 * step / self._n_steps)
-    time_elapsed = time.time() - self._start_time
-    time_per_step = time_elapsed / (
-        (step - self._start_step) if self._is_training else step)
-    msg += ", SPS: {:.1f}".format(1 / time_per_step)
-    msg += ", ELAP: " + secs_to_str(time_elapsed)
-    msg += ", ETA: " + secs_to_str(
-        (self._n_steps - step) * time_per_step)
-    if run_values is not None:
-      for tag, value in run_values.results.items():
-        msg += " - " + str(tag) + (": {:.4f}".format(value))
-    utils.log(msg)
-
-
-def secs_to_str(secs):
-  s = str(datetime.timedelta(seconds=int(round(secs))))
-  s = re.sub("^0:", "", s)
-  s = re.sub("^0", "", s)
-  s = re.sub("^0:", "", s)
-  s = re.sub("^0", "", s)
-  return s
-
-
-def get_bert_config(config):
-  """Get model hyperparameters based on a pretraining/finetuning config"""
-  if config.model_size == "large":
-    args = {"hidden_size": 1024, "num_hidden_layers": 24}
-  elif config.model_size == "base":
-    args = {"hidden_size": 768, "num_hidden_layers": 12}
-  elif config.model_size == "small":
-    args = {"hidden_size": 256, "num_hidden_layers": 12}
-  else:
-    raise ValueError("Unknown model size", config.model_size)
-  args["vocab_size"] = config.vocab_size
-  args.update(**config.model_hparam_overrides)
-  # by default the ff size and num attn heads are determined by the hidden size
-  args["num_attention_heads"] = max(1, args["hidden_size"] // 64)
-  args["intermediate_size"] = 4 * args["hidden_size"]
-  args.update(**config.model_hparam_overrides)
-  return modeling.BertConfig.from_dict(args)
|