SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/araelectra/run_pretraining.py
@@ -1,469 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Pre-trains an ELECTRA model."""
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import argparse
- import collections
- import json
-
- import tensorflow as tf
-
- import configure_pretraining
- from model import modeling
- from model import optimization
- from pretrain import pretrain_data
- from pretrain import pretrain_helpers
- from util import training_utils
- from util import utils
-
-
- class PretrainingModel(object):
-   """Transformer pre-training using the replaced-token-detection task."""
-
-   def __init__(self, config: configure_pretraining.PretrainingConfig,
-                features, is_training):
-     # Set up model config
-     self._config = config
-     self._bert_config = training_utils.get_bert_config(config)
-     if config.debug:
-       self._bert_config.num_hidden_layers = 3
-       self._bert_config.hidden_size = 144
-       self._bert_config.intermediate_size = 144 * 4
-       self._bert_config.num_attention_heads = 4
-
-     # Mask the input
-     unmasked_inputs = pretrain_data.features_to_inputs(features)
-     masked_inputs = pretrain_helpers.mask(
-         config, unmasked_inputs, config.mask_prob)
-
-     # Generator
-     embedding_size = (
-         self._bert_config.hidden_size if config.embedding_size is None else
-         config.embedding_size)
-     cloze_output = None
-     if config.uniform_generator:
-       # simple generator sampling fakes uniformly at random
-       mlm_output = self._get_masked_lm_output(masked_inputs, None)
-     elif ((config.electra_objective or config.electric_objective)
-           and config.untied_generator):
-       generator_config = get_generator_config(config, self._bert_config)
-       if config.two_tower_generator:
-         # two-tower cloze model generator used for electric
-         generator = TwoTowerClozeTransformer(
-             config, generator_config, unmasked_inputs, is_training,
-             embedding_size)
-         cloze_output = self._get_cloze_outputs(unmasked_inputs, generator)
-         mlm_output = get_softmax_output(
-             pretrain_helpers.gather_positions(
-                 cloze_output.logits, masked_inputs.masked_lm_positions),
-             masked_inputs.masked_lm_ids, masked_inputs.masked_lm_weights,
-             self._bert_config.vocab_size)
-       else:
-         # small masked language model generator
-         generator = build_transformer(
-             config, masked_inputs, is_training, generator_config,
-             embedding_size=(None if config.untied_generator_embeddings
-                             else embedding_size),
-             untied_embeddings=config.untied_generator_embeddings,
-             scope="generator")
-         mlm_output = self._get_masked_lm_output(masked_inputs, generator)
-     else:
-       # full-sized masked language model generator if using BERT objective or if
-       # the generator and discriminator have tied weights
-       generator = build_transformer(
-           config, masked_inputs, is_training, self._bert_config,
-           embedding_size=embedding_size)
-       mlm_output = self._get_masked_lm_output(masked_inputs, generator)
-     fake_data = self._get_fake_data(masked_inputs, mlm_output.logits)
-     self.mlm_output = mlm_output
-     self.total_loss = config.gen_weight * (
-         cloze_output.loss if config.two_tower_generator else mlm_output.loss)
-
-     # Discriminator
-     disc_output = None
-     if config.electra_objective or config.electric_objective:
-       discriminator = build_transformer(
-           config, fake_data.inputs, is_training, self._bert_config,
-           reuse=not config.untied_generator, embedding_size=embedding_size)
-       disc_output = self._get_discriminator_output(
-           fake_data.inputs, discriminator, fake_data.is_fake_tokens,
-           cloze_output)
-       self.total_loss += config.disc_weight * disc_output.loss
-
-     # Evaluation
-     eval_fn_inputs = {
-         "input_ids": masked_inputs.input_ids,
-         "masked_lm_preds": mlm_output.preds,
-         "mlm_loss": mlm_output.per_example_loss,
-         "masked_lm_ids": masked_inputs.masked_lm_ids,
-         "masked_lm_weights": masked_inputs.masked_lm_weights,
-         "input_mask": masked_inputs.input_mask
-     }
-     if config.electra_objective or config.electric_objective:
-       eval_fn_inputs.update({
-           "disc_loss": disc_output.per_example_loss,
-           "disc_labels": disc_output.labels,
-           "disc_probs": disc_output.probs,
-           "disc_preds": disc_output.preds,
-           "sampled_tokids": tf.argmax(fake_data.sampled_tokens, -1,
-                                       output_type=tf.int32)
-       })
-     eval_fn_keys = eval_fn_inputs.keys()
-     eval_fn_values = [eval_fn_inputs[k] for k in eval_fn_keys]
-
-     def metric_fn(*args):
-       """Computes the loss and accuracy of the model."""
-       d = {k: arg for k, arg in zip(eval_fn_keys, args)}
-       metrics = dict()
-       metrics["masked_lm_accuracy"] = tf.metrics.accuracy(
-           labels=tf.reshape(d["masked_lm_ids"], [-1]),
-           predictions=tf.reshape(d["masked_lm_preds"], [-1]),
-           weights=tf.reshape(d["masked_lm_weights"], [-1]))
-       metrics["masked_lm_loss"] = tf.metrics.mean(
-           values=tf.reshape(d["mlm_loss"], [-1]),
-           weights=tf.reshape(d["masked_lm_weights"], [-1]))
-       if config.electra_objective or config.electric_objective:
-         metrics["sampled_masked_lm_accuracy"] = tf.metrics.accuracy(
-             labels=tf.reshape(d["masked_lm_ids"], [-1]),
-             predictions=tf.reshape(d["sampled_tokids"], [-1]),
-             weights=tf.reshape(d["masked_lm_weights"], [-1]))
-         if config.disc_weight > 0:
-           metrics["disc_loss"] = tf.metrics.mean(d["disc_loss"])
-           metrics["disc_auc"] = tf.metrics.auc(
-               d["disc_labels"] * d["input_mask"],
-               d["disc_probs"] * tf.cast(d["input_mask"], tf.float32))
-           metrics["disc_accuracy"] = tf.metrics.accuracy(
-               labels=d["disc_labels"], predictions=d["disc_preds"],
-               weights=d["input_mask"])
-           metrics["disc_precision"] = tf.metrics.accuracy(
-               labels=d["disc_labels"], predictions=d["disc_preds"],
-               weights=d["disc_preds"] * d["input_mask"])
-           metrics["disc_recall"] = tf.metrics.accuracy(
-               labels=d["disc_labels"], predictions=d["disc_preds"],
-               weights=d["disc_labels"] * d["input_mask"])
-       return metrics
-     self.eval_metrics = (metric_fn, eval_fn_values)
-
-   def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
-     """Masked language modeling softmax layer."""
-     with tf.variable_scope("generator_predictions"):
-       if self._config.uniform_generator:
-         logits = tf.zeros(self._bert_config.vocab_size)
-         logits_tiled = tf.zeros(
-             modeling.get_shape_list(inputs.masked_lm_ids) +
-             [self._bert_config.vocab_size])
-         logits_tiled += tf.reshape(logits, [1, 1, self._bert_config.vocab_size])
-         logits = logits_tiled
-       else:
-         relevant_reprs = pretrain_helpers.gather_positions(
-             model.get_sequence_output(), inputs.masked_lm_positions)
-         logits = get_token_logits(
-             relevant_reprs, model.get_embedding_table(), self._bert_config)
-       return get_softmax_output(
-           logits, inputs.masked_lm_ids, inputs.masked_lm_weights,
-           self._bert_config.vocab_size)
-
-   def _get_discriminator_output(
-       self, inputs, discriminator, labels, cloze_output=None):
-     """Discriminator binary classifier."""
-     with tf.variable_scope("discriminator_predictions"):
-       hidden = tf.layers.dense(
-           discriminator.get_sequence_output(),
-           units=self._bert_config.hidden_size,
-           activation=modeling.get_activation(self._bert_config.hidden_act),
-           kernel_initializer=modeling.create_initializer(
-               self._bert_config.initializer_range))
-       logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
-       if self._config.electric_objective:
-         log_q = tf.reduce_sum(
-             tf.nn.log_softmax(cloze_output.logits) * tf.one_hot(
-                 inputs.input_ids, depth=self._bert_config.vocab_size,
-                 dtype=tf.float32), -1)
-         log_q = tf.stop_gradient(log_q)
-         logits += log_q
-         logits += tf.log(self._config.mask_prob / (1 - self._config.mask_prob))
-
-       weights = tf.cast(inputs.input_mask, tf.float32)
-       labelsf = tf.cast(labels, tf.float32)
-       losses = tf.nn.sigmoid_cross_entropy_with_logits(
-           logits=logits, labels=labelsf) * weights
-       per_example_loss = (tf.reduce_sum(losses, axis=-1) /
-                           (1e-6 + tf.reduce_sum(weights, axis=-1)))
-       loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
-       probs = tf.nn.sigmoid(logits)
-       preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
-       DiscOutput = collections.namedtuple(
-           "DiscOutput", ["loss", "per_example_loss", "probs", "preds",
-                          "labels"])
-       return DiscOutput(
-           loss=loss, per_example_loss=per_example_loss, probs=probs,
-           preds=preds, labels=labels,
-       )
-
-   def _get_fake_data(self, inputs, mlm_logits):
-     """Sample from the generator to create corrupted input."""
-     inputs = pretrain_helpers.unmask(inputs)
-     disallow = tf.one_hot(
-         inputs.masked_lm_ids, depth=self._bert_config.vocab_size,
-         dtype=tf.float32) if self._config.disallow_correct else None
-     sampled_tokens = tf.stop_gradient(pretrain_helpers.sample_from_softmax(
-         mlm_logits / self._config.temperature, disallow=disallow))
-     sampled_tokids = tf.argmax(sampled_tokens, -1, output_type=tf.int32)
-     updated_input_ids, masked = pretrain_helpers.scatter_update(
-         inputs.input_ids, sampled_tokids, inputs.masked_lm_positions)
-     if self._config.electric_objective:
-       labels = masked
-     else:
-       labels = masked * (1 - tf.cast(
-           tf.equal(updated_input_ids, inputs.input_ids), tf.int32))
-     updated_inputs = pretrain_data.get_updated_inputs(
-         inputs, input_ids=updated_input_ids)
-     FakedData = collections.namedtuple("FakedData", [
-         "inputs", "is_fake_tokens", "sampled_tokens"])
-     return FakedData(inputs=updated_inputs, is_fake_tokens=labels,
-                      sampled_tokens=sampled_tokens)
-
-   def _get_cloze_outputs(self, inputs: pretrain_data.Inputs, model):
-     """Cloze model softmax layer."""
-     weights = tf.cast(pretrain_helpers.get_candidates_mask(
-         self._config, inputs), tf.float32)
-     with tf.variable_scope("cloze_predictions"):
-       logits = get_token_logits(model.get_sequence_output(),
-                                 model.get_embedding_table(), self._bert_config)
-       return get_softmax_output(logits, inputs.input_ids, weights,
-                                 self._bert_config.vocab_size)
-
-
- def get_token_logits(input_reprs, embedding_table, bert_config):
-   hidden = tf.layers.dense(
-       input_reprs,
-       units=modeling.get_shape_list(embedding_table)[-1],
-       activation=modeling.get_activation(bert_config.hidden_act),
-       kernel_initializer=modeling.create_initializer(
-           bert_config.initializer_range))
-   hidden = modeling.layer_norm(hidden)
-   output_bias = tf.get_variable(
-       "output_bias",
-       shape=[bert_config.vocab_size],
-       initializer=tf.zeros_initializer())
-   logits = tf.matmul(hidden, embedding_table, transpose_b=True)
-   logits = tf.nn.bias_add(logits, output_bias)
-   return logits
-
-
- def get_softmax_output(logits, targets, weights, vocab_size):
-   oh_labels = tf.one_hot(targets, depth=vocab_size, dtype=tf.float32)
-   preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
-   probs = tf.nn.softmax(logits)
-   log_probs = tf.nn.log_softmax(logits)
-   label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)
-   numerator = tf.reduce_sum(weights * label_log_probs)
-   denominator = tf.reduce_sum(weights) + 1e-6
-   loss = numerator / denominator
-   SoftmaxOutput = collections.namedtuple(
-       "SoftmaxOutput", ["logits", "probs", "loss", "per_example_loss", "preds",
-                         "weights"])
-   return SoftmaxOutput(
-       logits=logits, probs=probs, per_example_loss=label_log_probs,
-       loss=loss, preds=preds, weights=weights)
-
-
- class TwoTowerClozeTransformer(object):
-   """Build a two-tower Transformer used as Electric's generator."""
-
-   def __init__(self, config, bert_config, inputs: pretrain_data.Inputs,
-                is_training, embedding_size):
-     ltr = build_transformer(
-         config, inputs, is_training, bert_config,
-         untied_embeddings=config.untied_generator_embeddings,
-         embedding_size=(None if config.untied_generator_embeddings
-                         else embedding_size),
-         scope="generator_ltr", ltr=True)
-     rtl = build_transformer(
-         config, inputs, is_training, bert_config,
-         untied_embeddings=config.untied_generator_embeddings,
-         embedding_size=(None if config.untied_generator_embeddings
-                         else embedding_size),
-         scope="generator_rtl", rtl=True)
-     ltr_reprs = ltr.get_sequence_output()
-     rtl_reprs = rtl.get_sequence_output()
-     self._sequence_output = tf.concat([roll(ltr_reprs, -1),
-                                        roll(rtl_reprs, 1)], -1)
-     self._embedding_table = ltr.embedding_table
-
-   def get_sequence_output(self):
-     return self._sequence_output
-
-   def get_embedding_table(self):
-     return self._embedding_table
-
-
- def build_transformer(config: configure_pretraining.PretrainingConfig,
-                       inputs: pretrain_data.Inputs, is_training,
-                       bert_config, reuse=False, **kwargs):
-   """Build a transformer encoder network."""
-   with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
-     return modeling.BertModel(
-         bert_config=bert_config,
-         is_training=is_training,
-         input_ids=inputs.input_ids,
-         input_mask=inputs.input_mask,
-         token_type_ids=inputs.segment_ids,
-         use_one_hot_embeddings=config.use_tpu,
-         **kwargs)
-
-
- def roll(arr, direction):
-   """Shifts embeddings in a [batch, seq_len, dim] tensor to the right/left."""
-   return tf.concat([arr[:, direction:, :], arr[:, :direction, :]], axis=1)
-
-
- def get_generator_config(config: configure_pretraining.PretrainingConfig,
-                          bert_config: modeling.BertConfig):
-   """Get model config for the generator network."""
-   gen_config = modeling.BertConfig.from_dict(bert_config.to_dict())
-   gen_config.hidden_size = int(round(
-       bert_config.hidden_size * config.generator_hidden_size))
-   gen_config.num_hidden_layers = int(round(
-       bert_config.num_hidden_layers * config.generator_layers))
-   gen_config.intermediate_size = 4 * gen_config.hidden_size
-   gen_config.num_attention_heads = max(1, gen_config.hidden_size // 64)
-   return gen_config
-
-
- def model_fn_builder(config: configure_pretraining.PretrainingConfig):
-   """Build the model for training."""
-
-   def model_fn(features, labels, mode, params):
-     """Build the model for training."""
-     model = PretrainingModel(config, features,
-                              mode == tf.estimator.ModeKeys.TRAIN)
-     utils.log("Model is built!")
-     if mode == tf.estimator.ModeKeys.TRAIN:
-       train_op = optimization.create_optimizer(
-           model.total_loss, config.learning_rate, config.num_train_steps,
-           weight_decay_rate=config.weight_decay_rate,
-           use_tpu=config.use_tpu,
-           warmup_steps=config.num_warmup_steps,
-           lr_decay_power=config.lr_decay_power
-       )
-       output_spec = tf.estimator.tpu.TPUEstimatorSpec(
-           mode=mode,
-           loss=model.total_loss,
-           train_op=train_op,
-           training_hooks=[training_utils.ETAHook(
-               {} if config.use_tpu else dict(loss=model.total_loss),
-               config.num_train_steps, config.iterations_per_loop,
-               config.use_tpu)]
-       )
-     elif mode == tf.estimator.ModeKeys.EVAL:
-       output_spec = tf.estimator.tpu.TPUEstimatorSpec(
-           mode=mode,
-           loss=model.total_loss,
-           eval_metrics=model.eval_metrics,
-           evaluation_hooks=[training_utils.ETAHook(
-               {} if config.use_tpu else dict(loss=model.total_loss),
-               config.num_eval_steps, config.iterations_per_loop,
-               config.use_tpu, is_training=False)])
-     else:
-       raise ValueError("Only TRAIN and EVAL modes are supported")
-     return output_spec
-
-   return model_fn
-
-
- def train_or_eval(config: configure_pretraining.PretrainingConfig):
-   """Run pre-training or evaluate the pre-trained model."""
-   if config.do_train == config.do_eval:
-     raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
-   if config.debug and config.do_train:
-     utils.rmkdir(config.model_dir)
-   utils.heading("Config:")
-   utils.log_config(config)
-
-   is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
-   tpu_cluster_resolver = None
-   if config.use_tpu and config.tpu_name:
-     tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
-         config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
-   tpu_config = tf.estimator.tpu.TPUConfig(
-       iterations_per_loop=config.iterations_per_loop,
-       num_shards=config.num_tpu_cores,
-       tpu_job_name=config.tpu_job_name,
-       per_host_input_for_training=is_per_host)
-   run_config = tf.estimator.tpu.RunConfig(
-       cluster=tpu_cluster_resolver,
-       model_dir=config.model_dir,
-       save_checkpoints_steps=config.save_checkpoints_steps,
-       keep_checkpoint_max=config.keep_checkpoint_max,
-       tpu_config=tpu_config)
-   model_fn = model_fn_builder(config=config)
-   estimator = tf.estimator.tpu.TPUEstimator(
-       use_tpu=config.use_tpu,
-       model_fn=model_fn,
-       config=run_config,
-       train_batch_size=config.train_batch_size,
-       eval_batch_size=config.eval_batch_size)
-
-   if config.do_train:
-     utils.heading("Running training")
-     estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
-                     max_steps=config.num_train_steps)
-   if config.do_eval:
-     utils.heading("Running evaluation")
-     result = estimator.evaluate(
-         input_fn=pretrain_data.get_input_fn(config, False),
-         steps=config.num_eval_steps)
-     for key in sorted(result.keys()):
-       utils.log(" {:} = {:}".format(key, str(result[key])))
-     return result
-
-
- def train_one_step(config: configure_pretraining.PretrainingConfig):
-   """Builds an ELECTRA model an trains it for one step; useful for debugging."""
-   train_input_fn = pretrain_data.get_input_fn(config, True)
-   features = tf.data.make_one_shot_iterator(train_input_fn(dict(
-       batch_size=config.train_batch_size))).get_next()
-   model = PretrainingModel(config, features, True)
-   with tf.Session() as sess:
-     sess.run(tf.global_variables_initializer())
-     utils.log(sess.run(model.total_loss))
-
-
- def main():
-   parser = argparse.ArgumentParser(description=__doc__)
-   parser.add_argument("--data-dir", required=True,
-                       help="Location of data files (model weights, etc).")
-   parser.add_argument("--model-name", required=True,
-                       help="The name of the model being fine-tuned.")
-   parser.add_argument("--hparams", default="{}",
-                       help="JSON dict of model hyperparameters.")
-   args = parser.parse_args()
-   if args.hparams.endswith(".json"):
-     hparams = utils.load_json(args.hparams)
-   else:
-     hparams = json.loads(args.hparams)
-   tf.logging.set_verbosity(tf.logging.ERROR)
-   train_or_eval(configure_pretraining.PretrainingConfig(
-       args.model_name, args.data_dir, **hparams))
-
-
- if __name__ == "__main__":
-   main()
sinatools/arabert/araelectra/util/__init__.py
@@ -1,14 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
sinatools/arabert/araelectra/util/training_utils.py
@@ -1,112 +0,0 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Utilities for training the models."""
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import datetime
- import re
- import time
- import tensorflow as tf
-
- from model import modeling
- from util import utils
-
-
- class ETAHook(tf.estimator.SessionRunHook):
-   """Print out the time remaining during training/evaluation."""
-
-   def __init__(self, to_log, n_steps, iterations_per_loop, on_tpu,
-                log_every=1, is_training=True):
-     self._to_log = to_log
-     self._n_steps = n_steps
-     self._iterations_per_loop = iterations_per_loop
-     self._on_tpu = on_tpu
-     self._log_every = log_every
-     self._is_training = is_training
-     self._steps_run_so_far = 0
-     self._global_step = None
-     self._global_step_tensor = None
-     self._start_step = None
-     self._start_time = None
-
-   def begin(self):
-     self._global_step_tensor = tf.train.get_or_create_global_step()
-
-   def before_run(self, run_context):
-     if self._start_time is None:
-       self._start_time = time.time()
-     return tf.estimator.SessionRunArgs(self._to_log)
-
-   def after_run(self, run_context, run_values):
-     self._global_step = run_context.session.run(self._global_step_tensor)
-     self._steps_run_so_far += self._iterations_per_loop if self._on_tpu else 1
-     if self._start_step is None:
-       self._start_step = self._global_step - (self._iterations_per_loop
-                                               if self._on_tpu else 1)
-     self.log(run_values)
-
-   def end(self, session):
-     self._global_step = session.run(self._global_step_tensor)
-     self.log()
-
-   def log(self, run_values=None):
-     step = self._global_step if self._is_training else self._steps_run_so_far
-     if step % self._log_every != 0:
-       return
-     msg = "{:}/{:} = {:.1f}%".format(step, self._n_steps,
-                                      100.0 * step / self._n_steps)
-     time_elapsed = time.time() - self._start_time
-     time_per_step = time_elapsed / (
-         (step - self._start_step) if self._is_training else step)
-     msg += ", SPS: {:.1f}".format(1 / time_per_step)
-     msg += ", ELAP: " + secs_to_str(time_elapsed)
-     msg += ", ETA: " + secs_to_str(
-         (self._n_steps - step) * time_per_step)
-     if run_values is not None:
-       for tag, value in run_values.results.items():
-         msg += " - " + str(tag) + (": {:.4f}".format(value))
-     utils.log(msg)
-
-
- def secs_to_str(secs):
-   s = str(datetime.timedelta(seconds=int(round(secs))))
-   s = re.sub("^0:", "", s)
-   s = re.sub("^0", "", s)
-   s = re.sub("^0:", "", s)
-   s = re.sub("^0", "", s)
-   return s
-
-
- def get_bert_config(config):
-   """Get model hyperparameters based on a pretraining/finetuning config"""
-   if config.model_size == "large":
-     args = {"hidden_size": 1024, "num_hidden_layers": 24}
-   elif config.model_size == "base":
-     args = {"hidden_size": 768, "num_hidden_layers": 12}
-   elif config.model_size == "small":
-     args = {"hidden_size": 256, "num_hidden_layers": 12}
-   else:
-     raise ValueError("Unknown model size", config.model_size)
-   args["vocab_size"] = config.vocab_size
-   args.update(**config.model_hparam_overrides)
-   # by default the ff size and num attn heads are determined by the hidden size
-   args["num_attention_heads"] = max(1, args["hidden_size"] // 64)
-   args["intermediate_size"] = 4 * args["hidden_size"]
-   args.update(**config.model_hparam_overrides)
-   return modeling.BertConfig.from_dict(args)