SinaTools-0.1.40-py2.py3-none-any.whl → SinaTools-1.0.1-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
sinatools/arabert/aragpt2/grover/optimization_adafactor.py
@@ -1,234 +0,0 @@
-# Original work Copyright 2018 The Google AI Language Team Authors.
-# Modified work Copyright 2019 Rowan Zellers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import re
-import tensorflow as tf
-from grover.utils import get_shape_list
-
-
-def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
-    """Creates an optimizer training op."""
-    global_step = tf.train.get_or_create_global_step()
-
-    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
-
-    # Implements linear decay of the learning rate.
-    learning_rate = tf.train.polynomial_decay(
-        learning_rate,
-        global_step,
-        num_train_steps,
-        end_learning_rate=0.0,
-        power=1.0,
-        cycle=False)
-
-    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
-    # learning rate will be `global_step/num_warmup_steps * init_lr`.
-    if num_warmup_steps:
-        global_steps_int = tf.cast(global_step, tf.int32)
-        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
-
-        global_steps_float = tf.cast(global_steps_int, tf.float32)
-        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
-
-        warmup_percent_done = global_steps_float / warmup_steps_float
-        warmup_learning_rate = init_lr * warmup_percent_done
-
-        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
-        learning_rate = (
-            (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
-
-    # It is recommended that you use this optimizer for fine tuning, since this
-    # is how the model was trained (note that the Adam m/v variables are NOT
-    # loaded from init_checkpoint.)
-    optimizer = AdaFactorOptimizer(
-        learning_rate=learning_rate,
-        weight_decay_rate=0.01,
-        beta_1=0.9,
-        beta_2=0.999,
-        epsilon=1e-6,
-        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
-
-    if use_tpu:
-        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
-
-    tvars = tf.trainable_variables()
-    grads = tf.gradients(loss, tvars)
-
-    # You could do this, but instead we don't because a) it's slow and b) we already did the 'update clipping'
-    # (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
-
-    train_op = optimizer.apply_gradients(
-        zip(grads, tvars), global_step=global_step)
-
-    # Normally the global step update is done inside of `apply_gradients`.
-    # However, `AdaFactorOptimizer` doesn't do this. But if you use
-    # a different optimizer, you should probably take this line out.
-    new_global_step = global_step + 1
-    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
-
-    train_metrics = {
-        'learning_rate': learning_rate,
-        'minibatch_loss': loss,
-        # 'minibatch_ppl': tf.math.exp(loss),
-    }
-    return train_op, train_metrics
-
-
-class AdaFactorOptimizer(tf.compat.v1.train.Optimizer):
-    """here's the optimizer we'll use"""
-
-    def __init__(self,
-                 learning_rate,
-                 weight_decay_rate=0.0,
-                 beta_1=0.9,
-                 beta_2=0.999,
-                 epsilon=1e-6,
-                 exclude_from_weight_decay=None,
-                 clipping_rate=1.0,
-                 name="AdaFactorOptimizer"):
-        """Constructs an AdaFactorOptimizer."""
-        super(AdaFactorOptimizer, self).__init__(False, name)
-
-        self.learning_rate = learning_rate
-        self.weight_decay_rate = weight_decay_rate
-        self.beta_1 = beta_1
-        self.beta_2 = beta_2
-        self.epsilon = epsilon
-        self.epsilon1 = 1e-30
-        self.epsilon2 = 0.001
-        self.clipping_rate = clipping_rate
-        self.exclude_from_weight_decay = exclude_from_weight_decay
-        self.use_locking = False
-
-    def _use_factored(self, shape):
-        return len(shape) >= 2
-
-    def _parameter_scale(self, var):
-        """Estimate the scale of the parameters from the current values.
-        We include a minimum value of 0.001 to give it a chance to escape 0
-        if it was zero-initialized.
-        Instead of using the value, we could impute the scale from the shape,
-        as initializers do.
-        Args:
-            var: a variable or Tensor.
-        Returns:
-            a Scalar
-        """
-        return tf.maximum(reduce_rms(var), self.epsilon2)
-
-    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-        """See base class."""
-        assignments = []
-        for (grad, param) in grads_and_vars:
-            if grad is None or param is None:
-                continue
-
-            param_name = self._get_variable_name(param.name)
-            shape_list = get_shape_list(param, expected_rank=[1, 2])
-
-            # decay_rate = 1 - tf.pow(tf.cast(tf.train.get_or_create_global_step(), tf.float32) + 1.0, -0.8)
-            decay_rate = self.beta_2
-            grad_squared = tf.square(grad) + self.epsilon1
-
-            update_scale = self.learning_rate
-            # update_scale = self.learning_rate * tf.cast(self._parameter_scale(param), dtype=tf.float32)
-
-            # HACK: Make things dependent on grad.
-            # This confounds the XLA rewriter and keeps it from fusing computations
-            # across different variables. This fusion is bad for HBM usage, since
-            # it causes the gradients to persist in memory.
-            grad_squared_mean = tf.reduce_mean(grad_squared)
-            decay_rate += grad_squared_mean * 1e-30
-            update_scale += grad_squared_mean * 1e-30
-
-            # END HACK
-
-            if self._use_factored(shape_list):
-                num_rows, num_columns = shape_list
-
-                vr = tf.get_variable(
-                    name=param_name + "/adafactor_vr",
-                    shape=[num_rows],
-                    dtype=tf.float32,
-                    trainable=False,
-                    initializer=tf.zeros_initializer())
-                vc = tf.get_variable(
-                    name=param_name + "/adafactor_vc",
-                    shape=[num_columns],
-                    dtype=tf.float32,
-                    trainable=False,
-                    initializer=tf.zeros_initializer())
-
-                next_vr = decay_rate * vr + (1 - decay_rate) * tf.reduce_mean(grad_squared, 1)
-                next_vc = decay_rate * vc + (1 - decay_rate) * tf.reduce_mean(grad_squared, 0)
-
-                long_term_mean = tf.reduce_mean(next_vr, -1, keepdims=True)
-                r_factor = tf.rsqrt(next_vr / long_term_mean + self.epsilon1)
-                c_factor = tf.rsqrt(next_vc + self.epsilon1)
-                update = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2)
-
-                assignments.append(vr.assign(next_vr, use_locking=self.use_locking))
-                assignments.append(vc.assign(next_vc, use_locking=self.use_locking))
-            else:
-                v = tf.get_variable(
-                    name=param_name + "/adafactor_v",
-                    shape=shape_list,
-                    dtype=tf.float32,
-                    trainable=False,
-                    initializer=tf.zeros_initializer())
-                next_v = decay_rate * v + (1 - decay_rate) * grad_squared
-
-                assignments.append(v.assign(next_v, use_locking=self.use_locking))
-                update = grad * tf.rsqrt(next_v + self.epsilon1)
-
-            clipping_denom = tf.maximum(1.0, reduce_rms(update) / self.clipping_rate)
-            update /= clipping_denom
-
-            # Do weight decay
-            # Just adding the square of the weights to the loss function is *not*
-            # the correct way of using L2 regularization/weight decay with Adam,
-            # since that will interact with the m and v parameters in strange ways.
-            #
-            # Instead we want to decay the weights in a manner that doesn't interact
-            # with the m/v parameters. This is equivalent to adding the square
-            # of the weights to the loss with plain (non-momentum) SGD.
-            if self._do_use_weight_decay(param_name):
-                update += self.weight_decay_rate * param
-
-            update_with_lr = update_scale * update
-            next_param = param - update_with_lr
-
-            assignments.append(param.assign(next_param, use_locking=self.use_locking))
-        return tf.group(*assignments, name=name)
-
-    def _do_use_weight_decay(self, param_name):
-        """Whether to use L2 weight decay for `param_name`."""
-        if not self.weight_decay_rate:
-            return False
-        if self.exclude_from_weight_decay:
-            for r in self.exclude_from_weight_decay:
-                if re.search(r, param_name) is not None:
-                    return False
-        return True
-
-    def _get_variable_name(self, param_name):
-        """Get the variable name from the tensor name."""
-        m = re.match("^(.*):\\d+$", param_name)
-        if m is not None:
-            param_name = m.group(1)
-        return param_name
-
-
-def reduce_rms(x):
-    return tf.sqrt(tf.reduce_mean(tf.square(x)))
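For orientation, the removed optimization_adafactor.py above exposes create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu), which builds a linear warmup/decay schedule around AdaFactorOptimizer and returns a (train_op, train_metrics) pair. The sketch below only illustrates that call contract under TF 1.x graph mode; the import path, the loss tensor, and all numeric values are assumptions made for the example, not part of this diff.

# Illustrative sketch only: wiring the removed create_optimizer into a TF 1.x graph.
# The import path and every numeric value here are assumed, not taken from the diff.
import tensorflow as tf
from grover.optimization_adafactor import create_optimizer  # package-internal path (assumed)

def add_training_op(loss):
    # loss: scalar training loss tensor already built elsewhere in the graph.
    train_op, train_metrics = create_optimizer(
        loss=loss,
        init_lr=1e-4,             # assumed peak learning rate
        num_train_steps=100000,   # assumed schedule length
        num_warmup_steps=10000,   # assumed linear-warmup steps
        use_tpu=False)            # CrossShardOptimizer is only wrapped in when True
    # train_metrics carries the 'learning_rate' and 'minibatch_loss' tensors for logging.
    return train_op, train_metrics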
sinatools/arabert/aragpt2/grover/train_tpu.py
@@ -1,187 +0,0 @@
-# Original work Copyright 2018 The Google AI Language Team Authors.
-# Modified work Copyright 2019 Rowan Zellers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""" Training script! From https://github.com/imcaspar/gpt2-ml"""
-
-import tensorflow as tf
-import os
-
-from grover.dataloader import input_fn_builder
-from grover.modeling import model_fn_builder, GroverConfig
-
-flags = tf.flags
-
-FLAGS = flags.FLAGS
-
-## Required parameters
-flags.DEFINE_string(
-    "config_file", 'configs/base.json',
-    "The config json file corresponding to the pre-trained news model. "
-    "This specifies the model architecture.")
-
-flags.DEFINE_string(
-    "input_file", None,
-    "Input TF example files (can be a glob or comma separated).")
-
-flags.DEFINE_string(
-    "output_dir", None,
-    "The output directory where the model checkpoints will be written.")
-
-## Other parameters
-flags.DEFINE_string(
-    "init_checkpoint", None,
-    "Initial checkpoint (usually from a pre-trained model).")
-
-flags.DEFINE_integer(
-    "max_seq_length", 1024,
-    "The maximum total input sequence length after BPE tokenization. "
-    "Sequences longer than this will be truncated, and sequences shorter "
-    "than this will be padded. Must match data generation.")
-
-flags.DEFINE_bool("do_train", False, "Whether to run training.")
-
-flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
-
-flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
-
-flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for evaluation.")
-
-flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for adafactor.")
-
-flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.")
-
-flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
-
-flags.DEFINE_integer("save_checkpoints_steps", 1000,
-                     "How often to save the model checkpoint.")
-
-flags.DEFINE_integer("iterations_per_loop", 1000,
-                     "How many steps to make in each estimator call.")
-
-flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
-
-flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
-
-flags.DEFINE_string(
-    "tpu_name", None,
-    "The Cloud TPU to use for training. This should be either the name "
-    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-    "url.")
-
-flags.DEFINE_string(
-    "tpu_zone", None,
-    "[Optional] GCE zone where the Cloud TPU is located in. If not "
-    "specified, we will attempt to automatically detect the GCE project from "
-    "metadata.")
-
-flags.DEFINE_string(
-    "gcp_project", None,
-    "[Optional] Project name for the Cloud TPU-enabled project. If not "
-    "specified, we will attempt to automatically detect the GCE project from "
-    "metadata.")
-
-flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
-
-flags.DEFINE_integer(
-    "num_tpu_cores", 8,
-    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
-
-
-def main(_):
-    tf.logging.set_verbosity(tf.logging.INFO)
-    logger = tf.get_logger()
-    logger.propagate = False
-
-    news_config = GroverConfig.from_json_file(FLAGS.config_file)
-
-    tf.gfile.MakeDirs(FLAGS.output_dir)
-
-    input_files = []
-    for input_pattern in FLAGS.input_file.split(","):
-        input_files.extend(tf.gfile.Glob(input_pattern))
-
-    # tf.logging.info("*** Input Files ***")
-    # for input_file in input_files:
-    #     tf.logging.info("  %s" % input_file)
-
-    tpu_cluster_resolver = None
-    if FLAGS.use_tpu and FLAGS.tpu_name:
-        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
-            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
-
-    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-    run_config = tf.contrib.tpu.RunConfig(
-        cluster=tpu_cluster_resolver,
-        master=FLAGS.master,
-        model_dir=FLAGS.output_dir,
-        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
-        keep_checkpoint_max=None,
-        tpu_config=tf.contrib.tpu.TPUConfig(
-            iterations_per_loop=FLAGS.iterations_per_loop,
-            num_shards=FLAGS.num_tpu_cores,
-            per_host_input_for_training=is_per_host))
-
-    model_fn = model_fn_builder(news_config, init_checkpoint=FLAGS.init_checkpoint,
-                                learning_rate=FLAGS.learning_rate,
-                                num_train_steps=FLAGS.num_train_steps,
-                                num_warmup_steps=FLAGS.num_warmup_steps,
-                                use_tpu=FLAGS.use_tpu,
-                                num_tpu_cores=FLAGS.num_tpu_cores,
-                                eval_batch_size=FLAGS.eval_batch_size
-                                )
-
-    # If TPU is not available, this will fall back to normal Estimator on CPU
-    # or GPU.
-    estimator = tf.contrib.tpu.TPUEstimator(
-        use_tpu=FLAGS.use_tpu,
-        model_fn=model_fn,
-        config=run_config,
-        train_batch_size=FLAGS.train_batch_size,
-        eval_batch_size=FLAGS.eval_batch_size,
-        params={'model_dir': FLAGS.output_dir}
-    )
-    if FLAGS.do_train:
-        tf.logging.info("***** Running training *****")
-        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
-        train_input_fn = input_fn_builder(
-            input_files=input_files,
-            seq_length=FLAGS.max_seq_length,
-            is_training=True)
-
-        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
-
-    if FLAGS.do_eval:
-        tf.logging.info("***** Running evaluation *****")
-        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
-
-        eval_input_fn = input_fn_builder(
-            input_files=input_files,
-            seq_length=FLAGS.max_seq_length,
-            is_training=False,
-        )
-
-        result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
-
-        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
-        with tf.gfile.GFile(output_eval_file, "w") as writer:
-            tf.logging.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                tf.logging.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    flags.mark_flag_as_required("input_file")
-    flags.mark_flag_as_required("output_dir")
-    tf.app.run()
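The removed train_tpu.py above is a command-line entry point driven entirely by the tf.flags definitions it declares. As a rough usage sketch, the argv list below shows how a launcher might have invoked it; the flag names come from the script itself, while the module path, bucket paths, and TPU name are placeholder assumptions.

# Sketch of a typical launch of the removed training script, expressed as argv.
# Flag names match the flags.DEFINE_* calls above; all concrete values are placeholders.
train_tpu_argv = [
    "python", "-m", "sinatools.arabert.aragpt2.grover.train_tpu",  # module path as packaged in 0.1.40 (assumed runnable this way)
    "--config_file=configs/base.json",
    "--input_file=gs://my-bucket/pretrain-*.tfrecord",  # placeholder TFRecord glob
    "--output_dir=gs://my-bucket/grover-ckpts",         # placeholder checkpoint dir
    "--do_train=True",
    "--train_batch_size=32",
    "--num_train_steps=100000",
    "--num_warmup_steps=10000",
    "--use_tpu=True",
    "--tpu_name=my-tpu",                                # placeholder Cloud TPU name
]
# e.g. subprocess.run(train_tpu_argv, check=True)  -- requires TF 1.x and the pre-1.0.1 package.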
sinatools/arabert/aragpt2/grover/utils.py
@@ -1,234 +0,0 @@
-# Original work Copyright 2018 The Google AI Language Team Authors.
-# Modified work Copyright 2019 Rowan Zellers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import re
-
-import six
-import tensorflow as tf
-import numpy as np
-from tensorflow.python.lib.io import file_io
-
-
-def _save_np(absolute_fn, array):
-    if absolute_fn.startswith('gs://'):
-        with file_io.FileIO(absolute_fn, 'w') as f:
-            np.save(f, array)
-    else:
-        np.save(absolute_fn, array)
-
-
-def assert_rank(tensor, expected_rank, name=None):
-    """Raises an exception if the tensor rank is not of the expected rank.
-
-    Args:
-        tensor: A tf.Tensor to check the rank of.
-        expected_rank: Python integer or list of integers, expected rank.
-        name: Optional name of the tensor for the error message.
-
-    Raises:
-        ValueError: If the expected shape doesn't match the actual shape.
-    """
-    if name is None:
-        name = tensor.name
-
-    expected_rank_dict = {}
-    if isinstance(expected_rank, six.integer_types):
-        expected_rank_dict[expected_rank] = True
-    else:
-        for x in expected_rank:
-            expected_rank_dict[x] = True
-
-    actual_rank = tensor.shape.ndims
-    if actual_rank not in expected_rank_dict:
-        scope_name = tf.get_variable_scope().name
-        raise ValueError(
-            "For the tensor `%s` in scope `%s`, the actual rank "
-            "`%d` (shape = %s) is not equal to the expected rank `%s`" %
-            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
-
-
-def get_shape_list(tensor, expected_rank=None, name=None):
-    """Returns a list of the shape of tensor, preferring static dimensions.
-
-    Args:
-        tensor: A tf.Tensor object to find the shape of.
-        expected_rank: (optional) int. The expected rank of `tensor`. If this is
-            specified and the `tensor` has a different rank, an exception will be
-            thrown.
-        name: Optional name of the tensor for the error message.
-
-    Returns:
-        A list of dimensions of the shape of tensor. All static dimensions will
-        be returned as python integers, and dynamic dimensions will be returned
-        as tf.Tensor scalars.
-    """
-    if name is None:
-        name = tensor.name
-
-    if expected_rank is not None:
-        assert_rank(tensor, expected_rank, name)
-
-    shape = tensor.shape.as_list()
-
-    non_static_indexes = []
-    for (index, dim) in enumerate(shape):
-        if dim is None:
-            non_static_indexes.append(index)
-
-    if not non_static_indexes:
-        return shape
-
-    dyn_shape = tf.shape(tensor)
-    for index in non_static_indexes:
-        shape[index] = dyn_shape[index]
-    return shape
-
-
-def gelu(input_tensor):
-    """Gaussian Error Linear Unit.
-
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-
-    Args:
-        input_tensor: float Tensor to perform activation.
-
-    Returns:
-        `input_tensor` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
-    return input_tensor * cdf
-
-
-def layer_norm(input_tensor, name=None, epsilon=1e-5):
-    """Run layer normalization on the last dimension of the tensor."""
-    name2use = f'LayerNorm_{name}' if name is not None else name
-    with tf.variable_scope(name2use, default_name='LayerNorm'):
-        dim = input_tensor.shape[-1].value
-        gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1))
-        beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0))
-        mean = tf.reduce_mean(input_tensor, axis=-1, keepdims=True)
-        std = tf.reduce_mean(tf.square(input_tensor - mean), axis=-1, keepdims=True)
-        input_tensor = (input_tensor - mean) * tf.rsqrt(std + epsilon)
-        input_tensor = input_tensor * gamma + beta
-    return input_tensor
-
-
-def dropout(input_tensor, dropout_prob):
-    """Perform dropout.
-
-    Args:
-        input_tensor: float Tensor.
-        dropout_prob: Python float. The probability of dropping out a value (NOT of
-            *keeping* a dimension as in `tf.nn.dropout`).
-
-    Returns:
-        A version of `input_tensor` with dropout applied.
-    """
-    if dropout_prob is None or dropout_prob == 0.0:
-        return input_tensor
-    output = tf.nn.dropout(input_tensor, rate=dropout_prob)
-    return output
-
-
-def get_attention_mask(nd, ns, *, dtype):
-    """
-    This is a TPU-compatible version of tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd),
-    where the lower right triangle contains 1s.
-    """
-    i = tf.range(nd)[:, None]
-    j = tf.range(ns)
-    m = i >= j - ns + nd
-    return tf.cast(m, dtype)
-
-
-def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
-    """Compute the union of the current variables and checkpoint variables."""
-    assignment_map = {}
-    initialized_variable_names = {}
-
-    name_to_variable = collections.OrderedDict()
-    for var in tvars:
-        name = var.name
-        m = re.match("^(.*):\\d+$", name)
-        if m is not None:
-            name = m.group(1)
-        name_to_variable[name] = var
-
-    init_vars = tf.train.list_variables(init_checkpoint)
-
-    assignment_map = collections.OrderedDict()
-    for x in init_vars:
-        (name, var) = (x[0], x[1])
-        if name not in name_to_variable:
-            continue
-        assignment_map[name] = name
-        initialized_variable_names[name] = 1
-        initialized_variable_names[name + ":0"] = 1
-    return (assignment_map, initialized_variable_names)
-
-
-def construct_scalar_host_call(metric_dict, model_dir, prefix=""):
-    """Construct a host call to log scalars when training on TPU.
-
-    Args:
-        metric_dict: A dict of the tensors to be logged.
-        model_dir: The location to write the summary.
-        prefix: The prefix (if any) to prepend to the metric names.
-
-    Returns:
-        A tuple of (function, args_to_be_passed_to_said_function)
-    """
-    metric_names = list(metric_dict.keys())
-
-    def host_call_fn(global_step, *args):
-        """Training host call. Creates scalar summaries for training metrics.
-
-        This function is executed on the CPU and should not directly reference
-        any Tensors in the rest of the `model_fn`. To pass Tensors from the
-        model to the `metric_fn`, provide as part of the `host_call`. See
-        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
-        for more information.
-
-        Arguments should match the list of `Tensor` objects passed as the second
-        element in the tuple passed to `host_call`.
-
-        Args:
-            global_step: `Tensor` with shape `[batch]` for the global_step.
-            *args: Remaining tensors to log.
-
-        Returns:
-            List of summary ops to run on the CPU host.
-        """
-        step = global_step[0]
-        with tf.contrib.summary.create_file_writer(
-                logdir=model_dir, filename_suffix=".host_call").as_default():
-            with tf.contrib.summary.always_record_summaries():
-                for i, name in enumerate(metric_names):
-                    tf.contrib.summary.scalar(prefix + name, args[i][0], step=step)
-
-                return tf.contrib.summary.all_summary_ops()
-
-    # To log the current learning rate, and gradient norm for Tensorboard, the
-    # summary op needs to be run on the host CPU via host_call. host_call
-    # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
-    # dimension. These Tensors are implicitly concatenated to
-    # [params['batch_size']].
-    global_step_tensor = tf.reshape(
-        tf.compat.v1.train.get_or_create_global_step(), [1])
-    other_tensors = [tf.reshape(metric_dict[key], [1]) for key in metric_names]
-
-    return host_call_fn, [global_step_tensor] + other_tensors
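The removed utils.py above is mostly shape and TPU-summary plumbing shared by the grover training code. As a quick orientation, the sketch below exercises the contract of get_shape_list and get_attention_mask under TF 1.x graph mode: static dimensions come back as Python ints, dynamic ones as scalar tensors. The wrapper function and its argument names are illustrative assumptions; only the two imported helpers come from the removed file, and the import path itself is assumed.

# Minimal sketch, assuming TF 1.x graph mode and the removed grover.utils module.
import tensorflow as tf
from grover.utils import get_shape_list, get_attention_mask  # package-internal path (assumed)

def causal_mask_for(token_ids):
    # token_ids: int32 Tensor of shape [batch_size, seq_len]; batch_size may be
    # dynamic (returned as a scalar Tensor), seq_len is assumed static here.
    batch_size, seq_len = get_shape_list(token_ids, expected_rank=2)
    del batch_size  # not needed for the mask; kept to show the two kinds of return value
    # seq_len is a Python int when static, so it can size the causal mask directly.
    mask = get_attention_mask(seq_len, seq_len, dtype=tf.float32)
    return mask  # shape [seq_len, seq_len]; ones on and below the diagonal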