SinaTools-0.1.1-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
nlptools/arabert/araelectra/model/modeling.py
@@ -0,0 +1,1029 @@
+ # coding=utf-8
+ # Copyright 2020 The Google Research Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """The transformer encoder used by ELECTRA. Essentially BERT's with a few
+ additional functionalities added.
+ """
+
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ import collections
+ import copy
+ import json
+ import math
+ import re
+
+ import numpy as np
+ import six
+ import tensorflow as tf
+ from tensorflow.contrib import layers as contrib_layers
+
+
+ class BertConfig(object):
+   """Configuration for `BertModel` (ELECTRA uses the same model as BERT)."""
+
+   def __init__(self,
+                vocab_size,
+                hidden_size=768,
+                num_hidden_layers=12,
+                num_attention_heads=12,
+                intermediate_size=3072,
+                hidden_act="gelu",
+                hidden_dropout_prob=0.1,
+                attention_probs_dropout_prob=0.1,
+                max_position_embeddings=512,
+                type_vocab_size=2,
+                initializer_range=0.02):
+     """Constructs BertConfig.
+
+     Args:
+       vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
+       hidden_size: Size of the encoder layers and the pooler layer.
+       num_hidden_layers: Number of hidden layers in the Transformer encoder.
+       num_attention_heads: Number of attention heads for each attention layer in
+         the Transformer encoder.
+       intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+         layer in the Transformer encoder.
+       hidden_act: The non-linear activation function (function or string) in the
+         encoder and pooler.
+       hidden_dropout_prob: The dropout probability for all fully connected
+         layers in the embeddings, encoder, and pooler.
+       attention_probs_dropout_prob: The dropout ratio for the attention
+         probabilities.
+       max_position_embeddings: The maximum sequence length that this model might
+         ever be used with. Typically set this to something large just in case
+         (e.g., 512 or 1024 or 2048).
+       type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+         `BertModel`.
+       initializer_range: The stdev of the truncated_normal_initializer for
+         initializing all weight matrices.
+     """
+     self.vocab_size = vocab_size
+     self.hidden_size = hidden_size
+     self.num_hidden_layers = num_hidden_layers
+     self.num_attention_heads = num_attention_heads
+     self.hidden_act = hidden_act
+     self.intermediate_size = intermediate_size
+     self.hidden_dropout_prob = hidden_dropout_prob
+     self.attention_probs_dropout_prob = attention_probs_dropout_prob
+     self.max_position_embeddings = max_position_embeddings
+     self.type_vocab_size = type_vocab_size
+     self.initializer_range = initializer_range
+
+   @classmethod
+   def from_dict(cls, json_object):
+     """Constructs a `BertConfig` from a Python dictionary of parameters."""
+     config = BertConfig(vocab_size=None)
+     for (key, value) in six.iteritems(json_object):
+       config.__dict__[key] = value
+     return config
+
+   @classmethod
+   def from_json_file(cls, json_file):
+     """Constructs a `BertConfig` from a json file of parameters."""
+     with tf.io.gfile.GFile(json_file, "r") as reader:
+       text = reader.read()
+     return cls.from_dict(json.loads(text))
+
+   def to_dict(self):
+     """Serializes this instance to a Python dictionary."""
+     output = copy.deepcopy(self.__dict__)
+     return output
+
+   def to_json_string(self):
+     """Serializes this instance to a JSON string."""
+     return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+
+ class BertModel(object):
+   """BERT model. Although the training algorithm is different, the transformer
+   model for ELECTRA is the same as BERT's.
+
+   Example usage:
+
+   ```python
+   # Already been converted into WordPiece token ids
+   input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
+   input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
+   token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
+
+   config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
+     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
+
+   model = modeling.BertModel(config=config, is_training=True,
+     input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
+
+   label_embeddings = tf.get_variable(...)
+   pooled_output = model.get_pooled_output()
+   logits = tf.matmul(pooled_output, label_embeddings)
+   ...
+   ```
+   """
+
+   def __init__(self,
+                bert_config,
+                is_training,
+                input_ids,
+                input_mask=None,
+                token_type_ids=None,
+                use_one_hot_embeddings=True,
+                scope=None,
+                embedding_size=None,
+                input_embeddings=None,
+                input_reprs=None,
+                update_embeddings=True,
+                untied_embeddings=False,
+                ltr=False,
+                rtl=False):
+     """Constructor for BertModel.
+
+     Args:
+       bert_config: `BertConfig` instance.
+       is_training: bool. true for training model, false for eval model. Controls
+         whether dropout will be applied.
+       input_ids: int32 Tensor of shape [batch_size, seq_length].
+       input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
+       token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
+       use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
+         embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
+         it is much faster if this is True, on the CPU or GPU, it is faster if
+         this is False.
+       scope: (optional) variable scope. Defaults to "electra".
+
+     Raises:
+       ValueError: The config is invalid or one of the input tensor shapes
+         is invalid.
+     """
+     bert_config = copy.deepcopy(bert_config)
+     if not is_training:
+       bert_config.hidden_dropout_prob = 0.0
+       bert_config.attention_probs_dropout_prob = 0.0
+
+     input_shape = get_shape_list(token_type_ids, expected_rank=2)
+     batch_size = input_shape[0]
+     seq_length = input_shape[1]
+
+     if input_mask is None:
+       input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
+
+     assert token_type_ids is not None
+
+     if input_reprs is None:
+       if input_embeddings is None:
+         with tf.variable_scope(
+             (scope if untied_embeddings else "electra") + "/embeddings",
+             reuse=tf.AUTO_REUSE):
+           # Perform embedding lookup on the word ids
+           if embedding_size is None:
+             embedding_size = bert_config.hidden_size
+           (self.token_embeddings, self.embedding_table) = embedding_lookup(
+               input_ids=input_ids,
+               vocab_size=bert_config.vocab_size,
+               embedding_size=embedding_size,
+               initializer_range=bert_config.initializer_range,
+               word_embedding_name="word_embeddings",
+               use_one_hot_embeddings=use_one_hot_embeddings)
+       else:
+         self.token_embeddings = input_embeddings
+
+       with tf.variable_scope(
+           (scope if untied_embeddings else "electra") + "/embeddings",
+           reuse=tf.AUTO_REUSE):
+         # Add positional embeddings and token type embeddings, then layer
+         # normalize and perform dropout.
+         self.embedding_output = embedding_postprocessor(
+             input_tensor=self.token_embeddings,
+             use_token_type=True,
+             token_type_ids=token_type_ids,
+             token_type_vocab_size=bert_config.type_vocab_size,
+             token_type_embedding_name="token_type_embeddings",
+             use_position_embeddings=True,
+             position_embedding_name="position_embeddings",
+             initializer_range=bert_config.initializer_range,
+             max_position_embeddings=bert_config.max_position_embeddings,
+             dropout_prob=bert_config.hidden_dropout_prob)
+     else:
+       self.embedding_output = input_reprs
+     if not update_embeddings:
+       self.embedding_output = tf.stop_gradient(self.embedding_output)
+
+     with tf.variable_scope(scope, default_name="electra"):
+       if self.embedding_output.shape[-1] != bert_config.hidden_size:
+         self.embedding_output = tf.layers.dense(
+             self.embedding_output, bert_config.hidden_size,
+             name="embeddings_project")
+
+       with tf.variable_scope("encoder"):
+         # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
+         # mask of shape [batch_size, seq_length, seq_length] which is used
+         # for the attention scores.
+         attention_mask = create_attention_mask_from_input_mask(
+             token_type_ids, input_mask)
+
+         # Add causal masking to the attention for running the transformer
+         # left-to-right or right-to-left
+         if ltr or rtl:
+           causal_mask = tf.ones((seq_length, seq_length))
+           if ltr:
+             causal_mask = tf.matrix_band_part(causal_mask, -1, 0)
+           else:
+             causal_mask = tf.matrix_band_part(causal_mask, 0, -1)
+           attention_mask *= tf.expand_dims(causal_mask, 0)
+
+         # Run the stacked transformer. Output shapes
+         # sequence_output: [batch_size, seq_length, hidden_size]
+         # pooled_output: [batch_size, hidden_size]
+         # all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size].
+         # attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
+         (self.all_layer_outputs, self.attn_maps) = transformer_model(
+             input_tensor=self.embedding_output,
+             attention_mask=attention_mask,
+             hidden_size=bert_config.hidden_size,
+             num_hidden_layers=bert_config.num_hidden_layers,
+             num_attention_heads=bert_config.num_attention_heads,
+             intermediate_size=bert_config.intermediate_size,
+             intermediate_act_fn=get_activation(bert_config.hidden_act),
+             hidden_dropout_prob=bert_config.hidden_dropout_prob,
+             attention_probs_dropout_prob=
+             bert_config.attention_probs_dropout_prob,
+             initializer_range=bert_config.initializer_range,
+             do_return_all_layers=True)
+         self.sequence_output = self.all_layer_outputs[-1]
+         self.pooled_output = self.sequence_output[:, 0]
+
+   def get_pooled_output(self):
+     return self.pooled_output
+
+   def get_sequence_output(self):
+     """Gets final hidden layer of encoder.
+
+     Returns:
+       float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
+       to the final hidden layer of the transformer encoder.
+     """
+     return self.sequence_output
+
+   def get_all_encoder_layers(self):
+     return self.all_layer_outputs
+
+   def get_embedding_output(self):
+     """Gets output of the embedding lookup (i.e., input to the transformer).
+
+     Returns:
+       float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
+       to the output of the embedding layer, after summing the word
+       embeddings with the positional embeddings and the token type embeddings,
+       then performing layer normalization. This is the input to the transformer.
+     """
+     return self.embedding_output
+
+   def get_embedding_table(self):
+     return self.embedding_table
+
+
+ def gelu(input_tensor):
+   """Gaussian Error Linear Unit.
+
+   This is a smoother version of the RELU.
+   Original paper: https://arxiv.org/abs/1606.08415
+
+   Args:
+     input_tensor: float Tensor to perform activation.
+
+   Returns:
+     `input_tensor` with the GELU activation applied.
+   """
+   cdf = 0.5 * (1.0 + tf.math.erf(input_tensor / tf.sqrt(2.0)))
+   return input_tensor * cdf
+
+
+ def get_activation(activation_string):
+   """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
+
+   Args:
+     activation_string: String name of the activation function.
+
+   Returns:
+     A Python function corresponding to the activation function. If
+     `activation_string` is None, empty, or "linear", this will return None.
+     If `activation_string` is not a string, it will return `activation_string`.
+
+   Raises:
+     ValueError: The `activation_string` does not correspond to a known
+       activation.
+   """
+
+   # We assume that anything that's not a string is already an activation
+   # function, so we just return it.
+   if not isinstance(activation_string, six.string_types):
+     return activation_string
+
+   if not activation_string:
+     return None
+
+   act = activation_string.lower()
+   if act == "linear":
+     return None
+   elif act == "relu":
+     return tf.nn.relu
+   elif act == "gelu":
+     return gelu
+   elif act == "tanh":
+     return tf.tanh
+   else:
+     raise ValueError("Unsupported activation: %s" % act)
+
+
+ def get_assignment_map_from_checkpoint(tvars, init_checkpoint, prefix=""):
+   """Compute the union of the current variables and checkpoint variables."""
+   name_to_variable = collections.OrderedDict()
+   for var in tvars:
+     name = var.name
+     m = re.match("^(.*):\\d+$", name)
+     if m is not None:
+       name = m.group(1)
+     name_to_variable[name] = var
+
+   initialized_variable_names = {}
+   assignment_map = collections.OrderedDict()
+   for x in tf.train.list_variables(init_checkpoint):
+     (name, var) = (x[0], x[1])
+     if prefix + name not in name_to_variable:
+       continue
+     assignment_map[name] = prefix + name
+     initialized_variable_names[name] = 1
+     initialized_variable_names[name + ":0"] = 1
+
+   return assignment_map, initialized_variable_names
+
+
+ def dropout(input_tensor, dropout_prob):
+   """Perform dropout.
+
+   Args:
+     input_tensor: float Tensor.
+     dropout_prob: Python float. The probability of dropping out a value (NOT of
+       *keeping* a dimension as in `tf.nn.dropout`).
+
+   Returns:
+     A version of `input_tensor` with dropout applied.
+   """
+   if dropout_prob is None or dropout_prob == 0.0:
+     return input_tensor
+
+   output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
+   return output
+
+
+ def layer_norm(input_tensor, name=None):
+   """Run layer normalization on the last dimension of the tensor."""
+   return contrib_layers.layer_norm(
+       inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
+
+
+ def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
+   """Runs layer normalization followed by dropout."""
+   output_tensor = layer_norm(input_tensor, name)
+   output_tensor = dropout(output_tensor, dropout_prob)
+   return output_tensor
+
+
+ def create_initializer(initializer_range=0.02):
+   """Creates a `truncated_normal_initializer` with the given range."""
+   return tf.truncated_normal_initializer(stddev=initializer_range)
+
+
+ def embedding_lookup(input_ids,
+                      vocab_size,
+                      embedding_size=128,
+                      initializer_range=0.02,
+                      word_embedding_name="word_embeddings",
+                      use_one_hot_embeddings=False):
+   """Looks up word embeddings for an id tensor.
+
+   Args:
+     input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
+       ids.
+     vocab_size: int. Size of the embedding vocabulary.
+     embedding_size: int. Width of the word embeddings.
+     initializer_range: float. Embedding initialization range.
+     word_embedding_name: string. Name of the embedding table.
+     use_one_hot_embeddings: bool. If True, use one-hot method for word
+       embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
+       for TPUs.
+
+   Returns:
+     float Tensor of shape [batch_size, seq_length, embedding_size].
+   """
+   # This function assumes that the input is of shape [batch_size, seq_length,
+   # num_inputs].
+   #
+   # If the input is a 2D tensor of shape [batch_size, seq_length], we
+   # reshape to [batch_size, seq_length, 1].
+   original_dims = input_ids.shape.ndims
+   if original_dims == 2:
+     input_ids = tf.expand_dims(input_ids, axis=[-1])
+
+   embedding_table = tf.get_variable(
+       name=word_embedding_name,
+       shape=[vocab_size, embedding_size],
+       initializer=create_initializer(initializer_range))
+
+   if original_dims == 3:
+     input_shape = get_shape_list(input_ids)
+     tf.reshape(input_ids, [-1, input_shape[-1]])
+     output = tf.matmul(input_ids, embedding_table)
+     output = tf.reshape(output,
+                         [input_shape[0], input_shape[1], embedding_size])
+   else:
+     if use_one_hot_embeddings:
+       flat_input_ids = tf.reshape(input_ids, [-1])
+       one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
+       output = tf.matmul(one_hot_input_ids, embedding_table)
+     else:
+       output = tf.nn.embedding_lookup(embedding_table, input_ids)
+
+     input_shape = get_shape_list(input_ids)
+
+     output = tf.reshape(output,
+                         input_shape[0:-1] + [input_shape[-1] * embedding_size])
+   return output, embedding_table
+
+
+ def embedding_postprocessor(input_tensor,
+                             use_token_type=False,
+                             token_type_ids=None,
+                             token_type_vocab_size=16,
+                             token_type_embedding_name="token_type_embeddings",
+                             use_position_embeddings=True,
+                             position_embedding_name="position_embeddings",
+                             initializer_range=0.02,
+                             max_position_embeddings=512,
+                             dropout_prob=0.1):
+   """Performs various post-processing on a word embedding tensor.
+
+   Args:
+     input_tensor: float Tensor of shape [batch_size, seq_length,
+       embedding_size].
+     use_token_type: bool. Whether to add embeddings for `token_type_ids`.
+     token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
+       Must be specified if `use_token_type` is True.
+     token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
+     token_type_embedding_name: string. The name of the embedding table variable
+       for token type ids.
+     use_position_embeddings: bool. Whether to add position embeddings for the
+       position of each token in the sequence.
+     position_embedding_name: string. The name of the embedding table variable
+       for positional embeddings.
+     initializer_range: float. Range of the weight initialization.
+     max_position_embeddings: int. Maximum sequence length that might ever be
+       used with this model. This can be longer than the sequence length of
+       input_tensor, but cannot be shorter.
+     dropout_prob: float. Dropout probability applied to the final output tensor.
+
+   Returns:
+     float tensor with same shape as `input_tensor`.
+
+   Raises:
+     ValueError: One of the tensor shapes or input values is invalid.
+   """
+   input_shape = get_shape_list(input_tensor, expected_rank=3)
+   batch_size = input_shape[0]
+   seq_length = input_shape[1]
+   width = input_shape[2]
+
+   output = input_tensor
+
+   if use_token_type:
+     if token_type_ids is None:
+       raise ValueError("`token_type_ids` must be specified if "
+                        "`use_token_type` is True.")
+     token_type_table = tf.get_variable(
+         name=token_type_embedding_name,
+         shape=[token_type_vocab_size, width],
+         initializer=create_initializer(initializer_range))
+     # This vocab will be small so we always do one-hot here, since it is always
+     # faster for a small vocabulary.
+     flat_token_type_ids = tf.reshape(token_type_ids, [-1])
+     one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
+     token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
+     token_type_embeddings = tf.reshape(token_type_embeddings,
+                                        [batch_size, seq_length, width])
+     output += token_type_embeddings
+
+   if use_position_embeddings:
+     assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
+     with tf.control_dependencies([assert_op]):
+       full_position_embeddings = tf.get_variable(
+           name=position_embedding_name,
+           shape=[max_position_embeddings, width],
+           initializer=create_initializer(initializer_range))
+       # Since the position embedding table is a learned variable, we create it
+       # using a (long) sequence length `max_position_embeddings`. The actual
+       # sequence length might be shorter than this, for faster training of
+       # tasks that do not have long sequences.
+       #
+       # So `full_position_embeddings` is effectively an embedding table
+       # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
+       # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
+       # perform a slice.
+       position_embeddings = tf.slice(full_position_embeddings, [0, 0],
+                                      [seq_length, -1])
+       num_dims = len(output.shape.as_list())
+
+       # Only the last two dimensions are relevant (`seq_length` and `width`), so
+       # we broadcast among the first dimensions, which is typically just
+       # the batch size.
+       position_broadcast_shape = []
+       for _ in range(num_dims - 2):
+         position_broadcast_shape.append(1)
+       position_broadcast_shape.extend([seq_length, width])
+       position_embeddings = tf.reshape(position_embeddings,
+                                        position_broadcast_shape)
+       output += position_embeddings
+
+   output = layer_norm_and_dropout(output, dropout_prob)
+   return output
+
+
+ def create_attention_mask_from_input_mask(from_tensor, to_mask):
+   """Create 3D attention mask from a 2D tensor mask.
+
+   Args:
+     from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
+     to_mask: int32 Tensor of shape [batch_size, to_seq_length].
+
+   Returns:
+     float Tensor of shape [batch_size, from_seq_length, to_seq_length].
+   """
+   from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
+   batch_size = from_shape[0]
+   from_seq_length = from_shape[1]
+
+   to_shape = get_shape_list(to_mask, expected_rank=2)
+   to_seq_length = to_shape[1]
+
+   to_mask = tf.cast(
+       tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
+
+   # We don't assume that `from_tensor` is a mask (although it could be). We
+   # don't actually care if we attend *from* padding tokens (only *to* padding
+   # tokens), so we create a tensor of all ones.
+   #
+   # `broadcast_ones` = [batch_size, from_seq_length, 1]
+   broadcast_ones = tf.ones(
+       shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
+
+   # Here we broadcast along two dimensions to create the mask.
+   mask = broadcast_ones * to_mask
+
+   return mask
+
+
+ def attention_layer(from_tensor,
+                     to_tensor,
+                     attention_mask=None,
+                     num_attention_heads=1,
+                     size_per_head=512,
+                     query_act=None,
+                     key_act=None,
+                     value_act=None,
+                     attention_probs_dropout_prob=0.0,
+                     initializer_range=0.02,
+                     do_return_2d_tensor=False,
+                     batch_size=None,
+                     from_seq_length=None,
+                     to_seq_length=None):
+   """Performs multi-headed attention from `from_tensor` to `to_tensor`.
+
+   This is an implementation of multi-headed attention based on "Attention
+   Is All You Need". If `from_tensor` and `to_tensor` are the same, then
+   this is self-attention. Each timestep in `from_tensor` attends to the
+   corresponding sequence in `to_tensor`, and returns a fixed-width vector.
+
+   This function first projects `from_tensor` into a "query" tensor and
+   `to_tensor` into "key" and "value" tensors. These are (effectively) a list
+   of tensors of length `num_attention_heads`, where each tensor is of shape
+   [batch_size, seq_length, size_per_head].
+
+   Then, the query and key tensors are dot-producted and scaled. These are
+   softmaxed to obtain attention probabilities. The value tensors are then
+   interpolated by these probabilities, then concatenated back to a single
+   tensor and returned.
+
+   In practice, the multi-headed attention is done with transposes and
+   reshapes rather than actual separate tensors.
+
+   Args:
+     from_tensor: float Tensor of shape [batch_size, from_seq_length,
+       from_width].
+     to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
+     attention_mask: (optional) int32 Tensor of shape [batch_size,
+       from_seq_length, to_seq_length]. The values should be 1 or 0. The
+       attention scores will effectively be set to -infinity for any positions in
+       the mask that are 0, and will be unchanged for positions that are 1.
+     num_attention_heads: int. Number of attention heads.
+     size_per_head: int. Size of each attention head.
+     query_act: (optional) Activation function for the query transform.
+     key_act: (optional) Activation function for the key transform.
+     value_act: (optional) Activation function for the value transform.
+     attention_probs_dropout_prob: (optional) float. Dropout probability of the
+       attention probabilities.
+     initializer_range: float. Range of the weight initializer.
+     do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
+       * from_seq_length, num_attention_heads * size_per_head]. If False, the
+       output will be of shape [batch_size, from_seq_length, num_attention_heads
+       * size_per_head].
+     batch_size: (Optional) int. If the input is 2D, this might be the batch size
+       of the 3D version of the `from_tensor` and `to_tensor`.
+     from_seq_length: (Optional) If the input is 2D, this might be the seq length
+       of the 3D version of the `from_tensor`.
+     to_seq_length: (Optional) If the input is 2D, this might be the seq length
+       of the 3D version of the `to_tensor`.
+
+   Returns:
+     float Tensor of shape [batch_size, from_seq_length,
+       num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
+       true, this will be of shape [batch_size * from_seq_length,
+       num_attention_heads * size_per_head]).
+
+   Raises:
+     ValueError: Any of the arguments or tensor shapes are invalid.
+   """
+
+   def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
+                            seq_length, width):
+     output_tensor = tf.reshape(
+         input_tensor, [batch_size, seq_length, num_attention_heads, width])
+
+     output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
+     return output_tensor
+
+   from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
+   to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
+
+   if len(from_shape) != len(to_shape):
+     raise ValueError(
+         "The rank of `from_tensor` must match the rank of `to_tensor`.")
+
+   if len(from_shape) == 3:
+     batch_size = from_shape[0]
+     from_seq_length = from_shape[1]
+     to_seq_length = to_shape[1]
+   elif len(from_shape) == 2:
+     if batch_size is None or from_seq_length is None or to_seq_length is None:
+       raise ValueError(
+           "When passing in rank 2 tensors to attention_layer, the values "
+           "for `batch_size`, `from_seq_length`, and `to_seq_length` "
+           "must all be specified.")
+
+   # Scalar dimensions referenced here:
+   #   B = batch size (number of sequences)
+   #   F = `from_tensor` sequence length
+   #   T = `to_tensor` sequence length
+   #   N = `num_attention_heads`
+   #   H = `size_per_head`
+
+   from_tensor_2d = reshape_to_matrix(from_tensor)
+   to_tensor_2d = reshape_to_matrix(to_tensor)
+
+   # `query_layer` = [B*F, N*H]
+   query_layer = tf.layers.dense(
+       from_tensor_2d,
+       num_attention_heads * size_per_head,
+       activation=query_act,
+       name="query",
+       kernel_initializer=create_initializer(initializer_range))
+
+   # `key_layer` = [B*T, N*H]
+   key_layer = tf.layers.dense(
+       to_tensor_2d,
+       num_attention_heads * size_per_head,
+       activation=key_act,
+       name="key",
+       kernel_initializer=create_initializer(initializer_range))
+
+   # `value_layer` = [B*T, N*H]
+   value_layer = tf.layers.dense(
+       to_tensor_2d,
+       num_attention_heads * size_per_head,
+       activation=value_act,
+       name="value",
+       kernel_initializer=create_initializer(initializer_range))
+
+   # `query_layer` = [B, N, F, H]
+   query_layer = transpose_for_scores(query_layer, batch_size,
+                                      num_attention_heads, from_seq_length,
+                                      size_per_head)
+
+   # `key_layer` = [B, N, T, H]
+   key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
+                                    to_seq_length, size_per_head)
+
+   # Take the dot product between "query" and "key" to get the raw
+   # attention scores.
+   # `attention_scores` = [B, N, F, T]
+   attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+   attention_scores = tf.multiply(attention_scores,
+                                  1.0 / math.sqrt(float(size_per_head)))
+
+   if attention_mask is not None:
+     # `attention_mask` = [B, 1, F, T]
+     attention_mask = tf.expand_dims(attention_mask, axis=[1])
+
+     # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+     # masked positions, this operation will create a tensor which is 0.0 for
+     # positions we want to attend and -10000.0 for masked positions.
+     adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
+
+     # Since we are adding it to the raw scores before the softmax, this is
+     # effectively the same as removing these entirely.
+     attention_scores += adder
+
+   # Normalize the attention scores to probabilities.
+   # `attention_probs` = [B, N, F, T]
+   attention_probs = tf.nn.softmax(attention_scores)
+
+   # This is actually dropping out entire tokens to attend to, which might
+   # seem a bit unusual, but is taken from the original Transformer paper.
+   attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
+
+   # `value_layer` = [B, T, N, H]
+   value_layer = tf.reshape(
+       value_layer,
+       [batch_size, to_seq_length, num_attention_heads, size_per_head])
+
+   # `value_layer` = [B, N, T, H]
+   value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
+
+   # `context_layer` = [B, N, F, H]
+   context_layer = tf.matmul(attention_probs, value_layer)
+
+   # `context_layer` = [B, F, N, H]
+   context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
+
+   if do_return_2d_tensor:
+     # `context_layer` = [B*F, N*H]
+     context_layer = tf.reshape(
+         context_layer,
+         [batch_size * from_seq_length, num_attention_heads * size_per_head])
+   else:
+     # `context_layer` = [B, F, N*H]
+     context_layer = tf.reshape(
+         context_layer,
+         [batch_size, from_seq_length, num_attention_heads * size_per_head])
+
+   return context_layer, attention_probs
+
+
+ def transformer_model(input_tensor,
+                       attention_mask=None,
+                       hidden_size=768,
+                       num_hidden_layers=12,
+                       num_attention_heads=12,
+                       intermediate_size=3072,
+                       intermediate_act_fn=gelu,
+                       hidden_dropout_prob=0.1,
+                       attention_probs_dropout_prob=0.1,
+                       initializer_range=0.02,
+                       do_return_all_layers=False):
+   """Multi-headed, multi-layer Transformer from "Attention is All You Need".
+
+   This is almost an exact implementation of the original Transformer encoder.
+
+   See the original paper:
+   https://arxiv.org/abs/1706.03762
+
+   Also see:
+   https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
+
+   Args:
+     input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
+     attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
+       seq_length], with 1 for positions that can be attended to and 0 in
+       positions that should not be.
+     hidden_size: int. Hidden size of the Transformer.
+     num_hidden_layers: int. Number of layers (blocks) in the Transformer.
+     num_attention_heads: int. Number of attention heads in the Transformer.
+     intermediate_size: int. The size of the "intermediate" (a.k.a., feed
+       forward) layer.
+     intermediate_act_fn: function. The non-linear activation function to apply
+       to the output of the intermediate/feed-forward layer.
+     hidden_dropout_prob: float. Dropout probability for the hidden layers.
+     attention_probs_dropout_prob: float. Dropout probability of the attention
+       probabilities.
+     initializer_range: float. Range of the initializer (stddev of truncated
+       normal).
+     do_return_all_layers: Whether to also return all layers or just the final
+       layer.
+
+   Returns:
+     float Tensor of shape [batch_size, seq_length, hidden_size], the final
+     hidden layer of the Transformer.
+
+   Raises:
+     ValueError: A Tensor shape or parameter is invalid.
+   """
+   if hidden_size % num_attention_heads != 0:
+     raise ValueError(
+         "The hidden size (%d) is not a multiple of the number of attention "
+         "heads (%d)" % (hidden_size, num_attention_heads))
+
+   attention_head_size = int(hidden_size / num_attention_heads)
+   input_shape = get_shape_list(input_tensor, expected_rank=3)
+   batch_size = input_shape[0]
+   seq_length = input_shape[1]
+   input_width = input_shape[2]
+
+   # The Transformer performs sum residuals on all layers so the input needs
+   # to be the same as the hidden size.
+   if input_width != hidden_size:
+     raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
+                      (input_width, hidden_size))
+
+   # We keep the representation as a 2D tensor to avoid re-shaping it back and
+   # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
+   # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
+   # help the optimizer.
+   prev_output = reshape_to_matrix(input_tensor)
+
+   attn_maps = []
+   all_layer_outputs = []
+   for layer_idx in range(num_hidden_layers):
+     with tf.variable_scope("layer_%d" % layer_idx):
+       with tf.variable_scope("attention"):
+         attention_heads = []
+         with tf.variable_scope("self"):
+           attention_head, probs = attention_layer(
+               from_tensor=prev_output,
+               to_tensor=prev_output,
+               attention_mask=attention_mask,
+               num_attention_heads=num_attention_heads,
+               size_per_head=attention_head_size,
+               attention_probs_dropout_prob=attention_probs_dropout_prob,
+               initializer_range=initializer_range,
+               do_return_2d_tensor=True,
+               batch_size=batch_size,
+               from_seq_length=seq_length,
+               to_seq_length=seq_length)
+           attention_heads.append(attention_head)
+           attn_maps.append(probs)
+
+         attention_output = None
+         if len(attention_heads) == 1:
+           attention_output = attention_heads[0]
+         else:
+           # In the case where we have other sequences, we just concatenate
+           # them to the self-attention head before the projection.
+           attention_output = tf.concat(attention_heads, axis=-1)
+
+         # Run a linear projection of `hidden_size` then add a residual
+         # with `layer_input`.
+         with tf.variable_scope("output"):
+           attention_output = tf.layers.dense(
+               attention_output,
+               hidden_size,
+               kernel_initializer=create_initializer(initializer_range))
+           attention_output = dropout(attention_output, hidden_dropout_prob)
+           attention_output = layer_norm(attention_output + prev_output)
+
+       # The activation is only applied to the "intermediate" hidden layer.
+       with tf.variable_scope("intermediate"):
+         intermediate_output = tf.layers.dense(
+             attention_output,
+             intermediate_size,
+             activation=intermediate_act_fn,
+             kernel_initializer=create_initializer(initializer_range))
+
+       # Down-project back to `hidden_size` then add the residual.
+       with tf.variable_scope("output"):
+         prev_output = tf.layers.dense(
+             intermediate_output,
+             hidden_size,
+             kernel_initializer=create_initializer(initializer_range))
+         prev_output = dropout(prev_output, hidden_dropout_prob)
+         prev_output = layer_norm(prev_output + attention_output)
+         all_layer_outputs.append(prev_output)
+
+   attn_maps = tf.stack(attn_maps, 0)
+   if do_return_all_layers:
+     return tf.stack([reshape_from_matrix(layer, input_shape)
+                      for layer in all_layer_outputs], 0), attn_maps
+   else:
+     return reshape_from_matrix(prev_output, input_shape), attn_maps
+
+
+ def get_shape_list(tensor, expected_rank=None, name=None):
+   """Returns a list of the shape of tensor, preferring static dimensions.
+
+   Args:
+     tensor: A tf.Tensor object to find the shape of.
+     expected_rank: (optional) int. The expected rank of `tensor`. If this is
+       specified and the `tensor` has a different rank, an exception will be
+       thrown.
+     name: Optional name of the tensor for the error message.
+
+   Returns:
+     A list of dimensions of the shape of tensor. All static dimensions will
+     be returned as python integers, and dynamic dimensions will be returned
+     as tf.Tensor scalars.
+   """
+   if isinstance(tensor, np.ndarray) or isinstance(tensor, list):
+     shape = np.array(tensor).shape
+     if isinstance(expected_rank, six.integer_types):
+       assert len(shape) == expected_rank
+     elif expected_rank is not None:
+       assert len(shape) in expected_rank
+     return shape
+
+   if name is None:
+     name = tensor.name
+
+   if expected_rank is not None:
+     assert_rank(tensor, expected_rank, name)
+
+   shape = tensor.shape.as_list()
+
+   non_static_indexes = []
+   for (index, dim) in enumerate(shape):
+     if dim is None:
+       non_static_indexes.append(index)
+
+   if not non_static_indexes:
+     return shape
+
+   dyn_shape = tf.shape(tensor)
+   for index in non_static_indexes:
+     shape[index] = dyn_shape[index]
+   return shape
+
+
+ def reshape_to_matrix(input_tensor):
+   """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
+   ndims = input_tensor.shape.ndims
+   if ndims < 2:
+     raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
+                      (input_tensor.shape))
+   if ndims == 2:
+     return input_tensor
+
+   width = input_tensor.shape[-1]
+   output_tensor = tf.reshape(input_tensor, [-1, width])
+   return output_tensor
+
+
+ def reshape_from_matrix(output_tensor, orig_shape_list):
+   """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
+   if len(orig_shape_list) == 2:
+     return output_tensor
+
+   output_shape = get_shape_list(output_tensor)
+
+   orig_dims = orig_shape_list[0:-1]
+   width = output_shape[-1]
+
+   return tf.reshape(output_tensor, orig_dims + [width])
+
+
+ def assert_rank(tensor, expected_rank, name=None):
+   """Raises an exception if the tensor rank is not of the expected rank.
+
+   Args:
+     tensor: A tf.Tensor to check the rank of.
+     expected_rank: Python integer or list of integers, expected rank.
+     name: Optional name of the tensor for the error message.
+
+   Raises:
+     ValueError: If the expected shape doesn't match the actual shape.
+   """
+   if name is None:
+     name = tensor.name
+
+   expected_rank_dict = {}
+   if isinstance(expected_rank, six.integer_types):
+     expected_rank_dict[expected_rank] = True
+   else:
+     for x in expected_rank:
+       expected_rank_dict[x] = True
+
+   actual_rank = tensor.shape.ndims
+   if actual_rank not in expected_rank_dict:
+     scope_name = tf.get_variable_scope().name
+     raise ValueError(
+         "For the tensor `%s` in scope `%s`, the actual rank "
+         "`%d` (shape = %s) is not equal to the expected rank `%s`" %
+         (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
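For quick orientation, the module's own `BertModel` docstring above already sketches the intended call pattern. Below is a minimal, hedged usage sketch along the same lines; it assumes a TensorFlow 1.x graph-mode environment and that the vendored file is importable as `nlptools.arabert.araelectra.model.modeling` (the module path is inferred from the file list; the shapes and config values are illustrative only, not taken from the package).

```python
import tensorflow as tf

# Assumed import path, based on the wheel's file layout shown above.
from nlptools.arabert.araelectra.model import modeling

# Illustrative config; a real run would typically use BertConfig.from_json_file().
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
                             num_hidden_layers=8, num_attention_heads=8,
                             intermediate_size=1024)

# Toy batch of already-tokenized WordPiece ids, shape [batch_size, seq_length].
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 0], [0, 0, 0]])  # required by this constructor

model = modeling.BertModel(config, is_training=False, input_ids=input_ids,
                           input_mask=input_mask, token_type_ids=token_type_ids)

sequence_output = model.get_sequence_output()  # [batch_size, seq_length, hidden_size]
pooled_output = model.get_pooled_output()      # [batch_size, hidden_size]
```

Note that, unlike upstream BERT, this constructor asserts that `token_type_ids` is provided (it derives the batch and sequence dimensions from it), and the variable scope defaults to "electra".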