SinaTools 0.1.40__py2.py3-none-any.whl → 1.0.1__py2.py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (64)
  1. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/METADATA +1 -1
  2. SinaTools-1.0.1.dist-info/RECORD +73 -0
  3. sinatools/VERSION +1 -1
  4. sinatools/ner/__init__.py +5 -7
  5. sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  6. sinatools/ner/trainers/BertTrainer.py +163 -163
  7. sinatools/ner/trainers/__init__.py +2 -2
  8. SinaTools-0.1.40.dist-info/RECORD +0 -123
  9. sinatools/arabert/arabert/__init__.py +0 -14
  10. sinatools/arabert/arabert/create_classification_data.py +0 -260
  11. sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  12. sinatools/arabert/arabert/extract_features.py +0 -444
  13. sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  14. sinatools/arabert/arabert/modeling.py +0 -1027
  15. sinatools/arabert/arabert/optimization.py +0 -202
  16. sinatools/arabert/arabert/run_classifier.py +0 -1078
  17. sinatools/arabert/arabert/run_pretraining.py +0 -593
  18. sinatools/arabert/arabert/run_squad.py +0 -1440
  19. sinatools/arabert/arabert/tokenization.py +0 -414
  20. sinatools/arabert/araelectra/__init__.py +0 -1
  21. sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  22. sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  23. sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  24. sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  25. sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  26. sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  27. sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  28. sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  29. sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  30. sinatools/arabert/araelectra/finetune/task.py +0 -74
  31. sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  32. sinatools/arabert/araelectra/flops_computation.py +0 -215
  33. sinatools/arabert/araelectra/model/__init__.py +0 -14
  34. sinatools/arabert/araelectra/model/modeling.py +0 -1029
  35. sinatools/arabert/araelectra/model/optimization.py +0 -193
  36. sinatools/arabert/araelectra/model/tokenization.py +0 -355
  37. sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  38. sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  39. sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  40. sinatools/arabert/araelectra/run_finetuning.py +0 -323
  41. sinatools/arabert/araelectra/run_pretraining.py +0 -469
  42. sinatools/arabert/araelectra/util/__init__.py +0 -14
  43. sinatools/arabert/araelectra/util/training_utils.py +0 -112
  44. sinatools/arabert/araelectra/util/utils.py +0 -109
  45. sinatools/arabert/aragpt2/__init__.py +0 -2
  46. sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  47. sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  48. sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  49. sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  50. sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  51. sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  52. sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  53. sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  54. sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  55. sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  56. sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  57. sinatools/arabert/aragpt2/grover/utils.py +0 -234
  58. sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  59. {SinaTools-0.1.40.data → SinaTools-1.0.1.data}/data/sinatools/environment.yml +0 -0
  60. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/AUTHORS.rst +0 -0
  61. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/LICENSE +0 -0
  62. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/WHEEL +0 -0
  63. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {SinaTools-0.1.40.dist-info → SinaTools-1.0.1.dist-info}/top_level.txt +0 -0
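The per-file counts above can be reproduced locally by downloading both wheels and diffing their contents. Below is a minimal sketch using only the Python standard library; the local wheel filenames are reconstructed from the title above, and the pip commands in the comment are one of several ways to fetch them.

```python
import difflib
import zipfile

# Assumed local filenames, reconstructed from the wheel names in the title above.
# One way to fetch them:
#   pip download SinaTools==0.1.40 --no-deps
#   pip download SinaTools==1.0.1 --no-deps
OLD_WHEEL = "SinaTools-0.1.40-py2.py3-none-any.whl"
NEW_WHEEL = "SinaTools-1.0.1-py2.py3-none-any.whl"

def read_lines(wheel_path, member):
    """Return the decoded lines of one file inside a wheel (wheels are zip archives)."""
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8", errors="replace").splitlines()

def list_members(wheel_path):
    with zipfile.ZipFile(wheel_path) as zf:
        return set(zf.namelist())

old_members = list_members(OLD_WHEEL)
new_members = list_members(NEW_WHEEL)

# Files present only in 0.1.40 show up as pure deletions (e.g. the arabert tree above).
for member in sorted(old_members - new_members):
    old = read_lines(OLD_WHEEL, member)
    diff = difflib.unified_diff(old, [], fromfile=member, tofile="/dev/null", lineterm="")
    print("\n".join(diff))
```

The single hunk shown below is the full body of sinatools/arabert/arabert/modeling.py, the largest of those deletions (+0 -1027 in the listing above).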
sinatools/arabert/arabert/modeling.py +0 -1027
@@ -1,1027 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2018 The Google AI Language Team Authors.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """The main BERT model and related functions."""
16
-
17
- from __future__ import absolute_import
18
- from __future__ import division
19
- from __future__ import print_function
20
-
21
- import collections
22
- import copy
23
- import json
24
- import math
25
- import re
26
- import numpy as np
27
- import six
28
- import tensorflow as tf
29
-
30
-
31
- class BertConfig(object):
32
- """Configuration for `BertModel`."""
33
-
34
- def __init__(
35
- self,
36
- vocab_size,
37
- hidden_size=768,
38
- num_hidden_layers=12,
39
- num_attention_heads=12,
40
- intermediate_size=3072,
41
- hidden_act="gelu",
42
- hidden_dropout_prob=0.1,
43
- attention_probs_dropout_prob=0.1,
44
- max_position_embeddings=512,
45
- type_vocab_size=16,
46
- initializer_range=0.02,
47
- ):
48
- """Constructs BertConfig.
49
-
50
- Args:
51
- vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
52
- hidden_size: Size of the encoder layers and the pooler layer.
53
- num_hidden_layers: Number of hidden layers in the Transformer encoder.
54
- num_attention_heads: Number of attention heads for each attention layer in
55
- the Transformer encoder.
56
- intermediate_size: The size of the "intermediate" (i.e., feed-forward)
57
- layer in the Transformer encoder.
58
- hidden_act: The non-linear activation function (function or string) in the
59
- encoder and pooler.
60
- hidden_dropout_prob: The dropout probability for all fully connected
61
- layers in the embeddings, encoder, and pooler.
62
- attention_probs_dropout_prob: The dropout ratio for the attention
63
- probabilities.
64
- max_position_embeddings: The maximum sequence length that this model might
65
- ever be used with. Typically set this to something large just in case
66
- (e.g., 512 or 1024 or 2048).
67
- type_vocab_size: The vocabulary size of the `token_type_ids` passed into
68
- `BertModel`.
69
- initializer_range: The stdev of the truncated_normal_initializer for
70
- initializing all weight matrices.
71
- """
72
- self.vocab_size = vocab_size
73
- self.hidden_size = hidden_size
74
- self.num_hidden_layers = num_hidden_layers
75
- self.num_attention_heads = num_attention_heads
76
- self.hidden_act = hidden_act
77
- self.intermediate_size = intermediate_size
78
- self.hidden_dropout_prob = hidden_dropout_prob
79
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
80
- self.max_position_embeddings = max_position_embeddings
81
- self.type_vocab_size = type_vocab_size
82
- self.initializer_range = initializer_range
83
-
84
- @classmethod
85
- def from_dict(cls, json_object):
86
- """Constructs a `BertConfig` from a Python dictionary of parameters."""
87
- config = BertConfig(vocab_size=None)
88
- for (key, value) in six.iteritems(json_object):
89
- config.__dict__[key] = value
90
- return config
91
-
92
- @classmethod
93
- def from_json_file(cls, json_file):
94
- """Constructs a `BertConfig` from a json file of parameters."""
95
- with tf.gfile.GFile(json_file, "r") as reader:
96
- text = reader.read()
97
- return cls.from_dict(json.loads(text))
98
-
99
- def to_dict(self):
100
- """Serializes this instance to a Python dictionary."""
101
- output = copy.deepcopy(self.__dict__)
102
- return output
103
-
104
- def to_json_string(self):
105
- """Serializes this instance to a JSON string."""
106
- return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
107
-
108
-
109
- class BertModel(object):
110
- """BERT model ("Bidirectional Encoder Representations from Transformers").
111
-
112
- Example usage:
113
-
114
- ```python
115
- # Already been converted into WordPiece token ids
116
- input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
117
- input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
118
- token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
119
-
120
- config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
121
- num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
122
-
123
- model = modeling.BertModel(config=config, is_training=True,
124
- input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
125
-
126
- label_embeddings = tf.get_variable(...)
127
- pooled_output = model.get_pooled_output()
128
- logits = tf.matmul(pooled_output, label_embeddings)
129
- ...
130
- ```
131
- """
132
-
133
- def __init__(
134
- self,
135
- config,
136
- is_training,
137
- input_ids,
138
- input_mask=None,
139
- token_type_ids=None,
140
- use_one_hot_embeddings=False,
141
- scope=None,
142
- ):
143
- """Constructor for BertModel.
144
-
145
- Args:
146
- config: `BertConfig` instance.
147
- is_training: bool. true for training model, false for eval model. Controls
148
- whether dropout will be applied.
149
- input_ids: int32 Tensor of shape [batch_size, seq_length].
150
- input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
151
- token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
152
- use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
153
- embeddings or tf.embedding_lookup() for the word embeddings.
154
- scope: (optional) variable scope. Defaults to "bert".
155
-
156
- Raises:
157
- ValueError: The config is invalid or one of the input tensor shapes
158
- is invalid.
159
- """
160
- config = copy.deepcopy(config)
161
- if not is_training:
162
- config.hidden_dropout_prob = 0.0
163
- config.attention_probs_dropout_prob = 0.0
164
-
165
- input_shape = get_shape_list(input_ids, expected_rank=2)
166
- batch_size = input_shape[0]
167
- seq_length = input_shape[1]
168
-
169
- if input_mask is None:
170
- input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
171
-
172
- if token_type_ids is None:
173
- token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
174
-
175
- with tf.variable_scope(scope, default_name="bert"):
176
- with tf.variable_scope("embeddings"):
177
- # Perform embedding lookup on the word ids.
178
- (self.embedding_output, self.embedding_table) = embedding_lookup(
179
- input_ids=input_ids,
180
- vocab_size=config.vocab_size,
181
- embedding_size=config.hidden_size,
182
- initializer_range=config.initializer_range,
183
- word_embedding_name="word_embeddings",
184
- use_one_hot_embeddings=use_one_hot_embeddings,
185
- )
186
-
187
- # Add positional embeddings and token type embeddings, then layer
188
- # normalize and perform dropout.
189
- self.embedding_output = embedding_postprocessor(
190
- input_tensor=self.embedding_output,
191
- use_token_type=True,
192
- token_type_ids=token_type_ids,
193
- token_type_vocab_size=config.type_vocab_size,
194
- token_type_embedding_name="token_type_embeddings",
195
- use_position_embeddings=True,
196
- position_embedding_name="position_embeddings",
197
- initializer_range=config.initializer_range,
198
- max_position_embeddings=config.max_position_embeddings,
199
- dropout_prob=config.hidden_dropout_prob,
200
- )
201
-
202
- with tf.variable_scope("encoder"):
203
- # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
204
- # mask of shape [batch_size, seq_length, seq_length] which is used
205
- # for the attention scores.
206
- attention_mask = create_attention_mask_from_input_mask(
207
- input_ids, input_mask
208
- )
209
-
210
- # Run the stacked transformer.
211
- # `sequence_output` shape = [batch_size, seq_length, hidden_size].
212
- self.all_encoder_layers = transformer_model(
213
- input_tensor=self.embedding_output,
214
- attention_mask=attention_mask,
215
- hidden_size=config.hidden_size,
216
- num_hidden_layers=config.num_hidden_layers,
217
- num_attention_heads=config.num_attention_heads,
218
- intermediate_size=config.intermediate_size,
219
- intermediate_act_fn=get_activation(config.hidden_act),
220
- hidden_dropout_prob=config.hidden_dropout_prob,
221
- attention_probs_dropout_prob=config.attention_probs_dropout_prob,
222
- initializer_range=config.initializer_range,
223
- do_return_all_layers=True,
224
- )
225
-
226
- self.sequence_output = self.all_encoder_layers[-1]
227
- # The "pooler" converts the encoded sequence tensor of shape
228
- # [batch_size, seq_length, hidden_size] to a tensor of shape
229
- # [batch_size, hidden_size]. This is necessary for segment-level
230
- # (or segment-pair-level) classification tasks where we need a fixed
231
- # dimensional representation of the segment.
232
- with tf.variable_scope("pooler"):
233
- # We "pool" the model by simply taking the hidden state corresponding
234
- # to the first token. We assume that this has been pre-trained
235
- first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
236
- self.pooled_output = tf.layers.dense(
237
- first_token_tensor,
238
- config.hidden_size,
239
- activation=tf.tanh,
240
- kernel_initializer=create_initializer(config.initializer_range),
241
- )
242
-
243
- def get_pooled_output(self):
244
- return self.pooled_output
245
-
246
- def get_sequence_output(self):
247
- """Gets final hidden layer of encoder.
248
-
249
- Returns:
250
- float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
251
- to the final hidden of the transformer encoder.
252
- """
253
- return self.sequence_output
254
-
255
- def get_all_encoder_layers(self):
256
- return self.all_encoder_layers
257
-
258
- def get_embedding_output(self):
259
- """Gets output of the embedding lookup (i.e., input to the transformer).
260
-
261
- Returns:
262
- float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
263
- to the output of the embedding layer, after summing the word
264
- embeddings with the positional embeddings and the token type embeddings,
265
- then performing layer normalization. This is the input to the transformer.
266
- """
267
- return self.embedding_output
268
-
269
- def get_embedding_table(self):
270
- return self.embedding_table
271
-
272
-
273
- def gelu(x):
274
- """Gaussian Error Linear Unit.
275
-
276
- This is a smoother version of the RELU.
277
- Original paper: https://arxiv.org/abs/1606.08415
278
- Args:
279
- x: float Tensor to perform activation.
280
-
281
- Returns:
282
- `x` with the GELU activation applied.
283
- """
284
- cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
285
- return x * cdf
286
-
287
-
288
- def get_activation(activation_string):
289
- """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
290
-
291
- Args:
292
- activation_string: String name of the activation function.
293
-
294
- Returns:
295
- A Python function corresponding to the activation function. If
296
- `activation_string` is None, empty, or "linear", this will return None.
297
- If `activation_string` is not a string, it will return `activation_string`.
298
-
299
- Raises:
300
- ValueError: The `activation_string` does not correspond to a known
301
- activation.
302
- """
303
-
304
- # We assume that anything that's not a string is already an activation
305
- # function, so we just return it.
306
- if not isinstance(activation_string, six.string_types):
307
- return activation_string
308
-
309
- if not activation_string:
310
- return None
311
-
312
- act = activation_string.lower()
313
- if act == "linear":
314
- return None
315
- elif act == "relu":
316
- return tf.nn.relu
317
- elif act == "gelu":
318
- return gelu
319
- elif act == "tanh":
320
- return tf.tanh
321
- else:
322
- raise ValueError("Unsupported activation: %s" % act)
323
-
324
-
325
- def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
326
- """Compute the union of the current variables and checkpoint variables."""
327
- assignment_map = {}
328
- initialized_variable_names = {}
329
-
330
- name_to_variable = collections.OrderedDict()
331
- for var in tvars:
332
- name = var.name
333
- m = re.match("^(.*):\\d+$", name)
334
- if m is not None:
335
- name = m.group(1)
336
- name_to_variable[name] = var
337
-
338
- init_vars = tf.train.list_variables(init_checkpoint)
339
-
340
- assignment_map = collections.OrderedDict()
341
- for x in init_vars:
342
- (name, var) = (x[0], x[1])
343
- if name not in name_to_variable:
344
- continue
345
- assignment_map[name] = name
346
- initialized_variable_names[name] = 1
347
- initialized_variable_names[name + ":0"] = 1
348
-
349
- return (assignment_map, initialized_variable_names)
350
-
351
-
352
- def dropout(input_tensor, dropout_prob):
353
- """Perform dropout.
354
-
355
- Args:
356
- input_tensor: float Tensor.
357
- dropout_prob: Python float. The probability of dropping out a value (NOT of
358
- *keeping* a dimension as in `tf.nn.dropout`).
359
-
360
- Returns:
361
- A version of `input_tensor` with dropout applied.
362
- """
363
- if dropout_prob is None or dropout_prob == 0.0:
364
- return input_tensor
365
-
366
- output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
367
- return output
368
-
369
-
370
- def layer_norm(input_tensor, name=None):
371
- """Run layer normalization on the last dimension of the tensor."""
372
- return tf.contrib.layers.layer_norm(
373
- inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name
374
- )
375
-
376
-
377
- def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
378
- """Runs layer normalization followed by dropout."""
379
- output_tensor = layer_norm(input_tensor, name)
380
- output_tensor = dropout(output_tensor, dropout_prob)
381
- return output_tensor
382
-
383
-
384
- def create_initializer(initializer_range=0.02):
385
- """Creates a `truncated_normal_initializer` with the given range."""
386
- return tf.truncated_normal_initializer(stddev=initializer_range)
387
-
388
-
389
- def embedding_lookup(
390
- input_ids,
391
- vocab_size,
392
- embedding_size=128,
393
- initializer_range=0.02,
394
- word_embedding_name="word_embeddings",
395
- use_one_hot_embeddings=False,
396
- ):
397
- """Looks up words embeddings for id tensor.
398
-
399
- Args:
400
- input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
401
- ids.
402
- vocab_size: int. Size of the embedding vocabulary.
403
- embedding_size: int. Width of the word embeddings.
404
- initializer_range: float. Embedding initialization range.
405
- word_embedding_name: string. Name of the embedding table.
406
- use_one_hot_embeddings: bool. If True, use one-hot method for word
407
- embeddings. If False, use `tf.gather()`.
408
-
409
- Returns:
410
- float Tensor of shape [batch_size, seq_length, embedding_size].
411
- """
412
- # This function assumes that the input is of shape [batch_size, seq_length,
413
- # num_inputs].
414
- #
415
- # If the input is a 2D tensor of shape [batch_size, seq_length], we
416
- # reshape to [batch_size, seq_length, 1].
417
- if input_ids.shape.ndims == 2:
418
- input_ids = tf.expand_dims(input_ids, axis=[-1])
419
-
420
- embedding_table = tf.get_variable(
421
- name=word_embedding_name,
422
- shape=[vocab_size, embedding_size],
423
- initializer=create_initializer(initializer_range),
424
- )
425
-
426
- flat_input_ids = tf.reshape(input_ids, [-1])
427
- if use_one_hot_embeddings:
428
- one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
429
- output = tf.matmul(one_hot_input_ids, embedding_table)
430
- else:
431
- output = tf.gather(embedding_table, flat_input_ids)
432
-
433
- input_shape = get_shape_list(input_ids)
434
-
435
- output = tf.reshape(output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
436
- return (output, embedding_table)
437
-
438
-
439
- def embedding_postprocessor(
440
- input_tensor,
441
- use_token_type=False,
442
- token_type_ids=None,
443
- token_type_vocab_size=16,
444
- token_type_embedding_name="token_type_embeddings",
445
- use_position_embeddings=True,
446
- position_embedding_name="position_embeddings",
447
- initializer_range=0.02,
448
- max_position_embeddings=512,
449
- dropout_prob=0.1,
450
- ):
451
- """Performs various post-processing on a word embedding tensor.
452
-
453
- Args:
454
- input_tensor: float Tensor of shape [batch_size, seq_length,
455
- embedding_size].
456
- use_token_type: bool. Whether to add embeddings for `token_type_ids`.
457
- token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
458
- Must be specified if `use_token_type` is True.
459
- token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
460
- token_type_embedding_name: string. The name of the embedding table variable
461
- for token type ids.
462
- use_position_embeddings: bool. Whether to add position embeddings for the
463
- position of each token in the sequence.
464
- position_embedding_name: string. The name of the embedding table variable
465
- for positional embeddings.
466
- initializer_range: float. Range of the weight initialization.
467
- max_position_embeddings: int. Maximum sequence length that might ever be
468
- used with this model. This can be longer than the sequence length of
469
- input_tensor, but cannot be shorter.
470
- dropout_prob: float. Dropout probability applied to the final output tensor.
471
-
472
- Returns:
473
- float tensor with same shape as `input_tensor`.
474
-
475
- Raises:
476
- ValueError: One of the tensor shapes or input values is invalid.
477
- """
478
- input_shape = get_shape_list(input_tensor, expected_rank=3)
479
- batch_size = input_shape[0]
480
- seq_length = input_shape[1]
481
- width = input_shape[2]
482
-
483
- output = input_tensor
484
-
485
- if use_token_type:
486
- if token_type_ids is None:
487
- raise ValueError(
488
- "`token_type_ids` must be specified if" "`use_token_type` is True."
489
- )
490
- token_type_table = tf.get_variable(
491
- name=token_type_embedding_name,
492
- shape=[token_type_vocab_size, width],
493
- initializer=create_initializer(initializer_range),
494
- )
495
- # This vocab will be small so we always do one-hot here, since it is always
496
- # faster for a small vocabulary.
497
- flat_token_type_ids = tf.reshape(token_type_ids, [-1])
498
- one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
499
- token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
500
- token_type_embeddings = tf.reshape(
501
- token_type_embeddings, [batch_size, seq_length, width]
502
- )
503
- output += token_type_embeddings
504
-
505
- if use_position_embeddings:
506
- assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
507
- with tf.control_dependencies([assert_op]):
508
- full_position_embeddings = tf.get_variable(
509
- name=position_embedding_name,
510
- shape=[max_position_embeddings, width],
511
- initializer=create_initializer(initializer_range),
512
- )
513
- # Since the position embedding table is a learned variable, we create it
514
- # using a (long) sequence length `max_position_embeddings`. The actual
515
- # sequence length might be shorter than this, for faster training of
516
- # tasks that do not have long sequences.
517
- #
518
- # So `full_position_embeddings` is effectively an embedding table
519
- # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
520
- # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
521
- # perform a slice.
522
- position_embeddings = tf.slice(
523
- full_position_embeddings, [0, 0], [seq_length, -1]
524
- )
525
- num_dims = len(output.shape.as_list())
526
-
527
- # Only the last two dimensions are relevant (`seq_length` and `width`), so
528
- # we broadcast among the first dimensions, which is typically just
529
- # the batch size.
530
- position_broadcast_shape = []
531
- for _ in range(num_dims - 2):
532
- position_broadcast_shape.append(1)
533
- position_broadcast_shape.extend([seq_length, width])
534
- position_embeddings = tf.reshape(
535
- position_embeddings, position_broadcast_shape
536
- )
537
- output += position_embeddings
538
-
539
- output = layer_norm_and_dropout(output, dropout_prob)
540
- return output
541
-
542
-
543
- def create_attention_mask_from_input_mask(from_tensor, to_mask):
544
- """Create 3D attention mask from a 2D tensor mask.
545
-
546
- Args:
547
- from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
548
- to_mask: int32 Tensor of shape [batch_size, to_seq_length].
549
-
550
- Returns:
551
- float Tensor of shape [batch_size, from_seq_length, to_seq_length].
552
- """
553
- from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
554
- batch_size = from_shape[0]
555
- from_seq_length = from_shape[1]
556
-
557
- to_shape = get_shape_list(to_mask, expected_rank=2)
558
- to_seq_length = to_shape[1]
559
-
560
- to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
561
-
562
- # We don't assume that `from_tensor` is a mask (although it could be). We
563
- # don't actually care if we attend *from* padding tokens (only *to* padding)
564
- # tokens so we create a tensor of all ones.
565
- #
566
- # `broadcast_ones` = [batch_size, from_seq_length, 1]
567
- broadcast_ones = tf.ones(shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
568
-
569
- # Here we broadcast along two dimensions to create the mask.
570
- mask = broadcast_ones * to_mask
571
-
572
- return mask
573
-
574
-
575
- def attention_layer(
576
- from_tensor,
577
- to_tensor,
578
- attention_mask=None,
579
- num_attention_heads=1,
580
- size_per_head=512,
581
- query_act=None,
582
- key_act=None,
583
- value_act=None,
584
- attention_probs_dropout_prob=0.0,
585
- initializer_range=0.02,
586
- do_return_2d_tensor=False,
587
- batch_size=None,
588
- from_seq_length=None,
589
- to_seq_length=None,
590
- ):
591
- """Performs multi-headed attention from `from_tensor` to `to_tensor`.
592
-
593
- This is an implementation of multi-headed attention based on "Attention
594
- is all you Need". If `from_tensor` and `to_tensor` are the same, then
595
- this is self-attention. Each timestep in `from_tensor` attends to the
596
- corresponding sequence in `to_tensor`, and returns a fixed-width vector.
597
-
598
- This function first projects `from_tensor` into a "query" tensor and
599
- `to_tensor` into "key" and "value" tensors. These are (effectively) a list
600
- of tensors of length `num_attention_heads`, where each tensor is of shape
601
- [batch_size, seq_length, size_per_head].
602
-
603
- Then, the query and key tensors are dot-producted and scaled. These are
604
- softmaxed to obtain attention probabilities. The value tensors are then
605
- interpolated by these probabilities, then concatenated back to a single
606
- tensor and returned.
607
-
608
- In practice, the multi-headed attention is done with transposes and
609
- reshapes rather than actual separate tensors.
610
-
611
- Args:
612
- from_tensor: float Tensor of shape [batch_size, from_seq_length,
613
- from_width].
614
- to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
615
- attention_mask: (optional) int32 Tensor of shape [batch_size,
616
- from_seq_length, to_seq_length]. The values should be 1 or 0. The
617
- attention scores will effectively be set to -infinity for any positions in
618
- the mask that are 0, and will be unchanged for positions that are 1.
619
- num_attention_heads: int. Number of attention heads.
620
- size_per_head: int. Size of each attention head.
621
- query_act: (optional) Activation function for the query transform.
622
- key_act: (optional) Activation function for the key transform.
623
- value_act: (optional) Activation function for the value transform.
624
- attention_probs_dropout_prob: (optional) float. Dropout probability of the
625
- attention probabilities.
626
- initializer_range: float. Range of the weight initializer.
627
- do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
628
- * from_seq_length, num_attention_heads * size_per_head]. If False, the
629
- output will be of shape [batch_size, from_seq_length, num_attention_heads
630
- * size_per_head].
631
- batch_size: (Optional) int. If the input is 2D, this might be the batch size
632
- of the 3D version of the `from_tensor` and `to_tensor`.
633
- from_seq_length: (Optional) If the input is 2D, this might be the seq length
634
- of the 3D version of the `from_tensor`.
635
- to_seq_length: (Optional) If the input is 2D, this might be the seq length
636
- of the 3D version of the `to_tensor`.
637
-
638
- Returns:
639
- float Tensor of shape [batch_size, from_seq_length,
640
- num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
641
- true, this will be of shape [batch_size * from_seq_length,
642
- num_attention_heads * size_per_head]).
643
-
644
- Raises:
645
- ValueError: Any of the arguments or tensor shapes are invalid.
646
- """
647
-
648
- def transpose_for_scores(
649
- input_tensor, batch_size, num_attention_heads, seq_length, width
650
- ):
651
- output_tensor = tf.reshape(
652
- input_tensor, [batch_size, seq_length, num_attention_heads, width]
653
- )
654
-
655
- output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
656
- return output_tensor
657
-
658
- from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
659
- to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
660
-
661
- if len(from_shape) != len(to_shape):
662
- raise ValueError(
663
- "The rank of `from_tensor` must match the rank of `to_tensor`."
664
- )
665
-
666
- if len(from_shape) == 3:
667
- batch_size = from_shape[0]
668
- from_seq_length = from_shape[1]
669
- to_seq_length = to_shape[1]
670
- elif len(from_shape) == 2:
671
- if batch_size is None or from_seq_length is None or to_seq_length is None:
672
- raise ValueError(
673
- "When passing in rank 2 tensors to attention_layer, the values "
674
- "for `batch_size`, `from_seq_length`, and `to_seq_length` "
675
- "must all be specified."
676
- )
677
-
678
- # Scalar dimensions referenced here:
679
- # B = batch size (number of sequences)
680
- # F = `from_tensor` sequence length
681
- # T = `to_tensor` sequence length
682
- # N = `num_attention_heads`
683
- # H = `size_per_head`
684
-
685
- from_tensor_2d = reshape_to_matrix(from_tensor)
686
- to_tensor_2d = reshape_to_matrix(to_tensor)
687
-
688
- # `query_layer` = [B*F, N*H]
689
- query_layer = tf.layers.dense(
690
- from_tensor_2d,
691
- num_attention_heads * size_per_head,
692
- activation=query_act,
693
- name="query",
694
- kernel_initializer=create_initializer(initializer_range),
695
- )
696
-
697
- # `key_layer` = [B*T, N*H]
698
- key_layer = tf.layers.dense(
699
- to_tensor_2d,
700
- num_attention_heads * size_per_head,
701
- activation=key_act,
702
- name="key",
703
- kernel_initializer=create_initializer(initializer_range),
704
- )
705
-
706
- # `value_layer` = [B*T, N*H]
707
- value_layer = tf.layers.dense(
708
- to_tensor_2d,
709
- num_attention_heads * size_per_head,
710
- activation=value_act,
711
- name="value",
712
- kernel_initializer=create_initializer(initializer_range),
713
- )
714
-
715
- # `query_layer` = [B, N, F, H]
716
- query_layer = transpose_for_scores(
717
- query_layer, batch_size, num_attention_heads, from_seq_length, size_per_head
718
- )
719
-
720
- # `key_layer` = [B, N, T, H]
721
- key_layer = transpose_for_scores(
722
- key_layer, batch_size, num_attention_heads, to_seq_length, size_per_head
723
- )
724
-
725
- # Take the dot product between "query" and "key" to get the raw
726
- # attention scores.
727
- # `attention_scores` = [B, N, F, T]
728
- attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
729
- attention_scores = tf.multiply(
730
- attention_scores, 1.0 / math.sqrt(float(size_per_head))
731
- )
732
-
733
- if attention_mask is not None:
734
- # `attention_mask` = [B, 1, F, T]
735
- attention_mask = tf.expand_dims(attention_mask, axis=[1])
736
-
737
- # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
738
- # masked positions, this operation will create a tensor which is 0.0 for
739
- # positions we want to attend and -10000.0 for masked positions.
740
- adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
741
-
742
- # Since we are adding it to the raw scores before the softmax, this is
743
- # effectively the same as removing these entirely.
744
- attention_scores += adder
745
-
746
- # Normalize the attention scores to probabilities.
747
- # `attention_probs` = [B, N, F, T]
748
- attention_probs = tf.nn.softmax(attention_scores)
749
-
750
- # This is actually dropping out entire tokens to attend to, which might
751
- # seem a bit unusual, but is taken from the original Transformer paper.
752
- attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
753
-
754
- # `value_layer` = [B, T, N, H]
755
- value_layer = tf.reshape(
756
- value_layer, [batch_size, to_seq_length, num_attention_heads, size_per_head]
757
- )
758
-
759
- # `value_layer` = [B, N, T, H]
760
- value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
761
-
762
- # `context_layer` = [B, N, F, H]
763
- context_layer = tf.matmul(attention_probs, value_layer)
764
-
765
- # `context_layer` = [B, F, N, H]
766
- context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
767
-
768
- if do_return_2d_tensor:
769
- # `context_layer` = [B*F, N*H]
770
- context_layer = tf.reshape(
771
- context_layer,
772
- [batch_size * from_seq_length, num_attention_heads * size_per_head],
773
- )
774
- else:
775
- # `context_layer` = [B, F, N*H]
776
- context_layer = tf.reshape(
777
- context_layer,
778
- [batch_size, from_seq_length, num_attention_heads * size_per_head],
779
- )
780
-
781
- return context_layer
782
-
783
-
784
- def transformer_model(
785
- input_tensor,
786
- attention_mask=None,
787
- hidden_size=768,
788
- num_hidden_layers=12,
789
- num_attention_heads=12,
790
- intermediate_size=3072,
791
- intermediate_act_fn=gelu,
792
- hidden_dropout_prob=0.1,
793
- attention_probs_dropout_prob=0.1,
794
- initializer_range=0.02,
795
- do_return_all_layers=False,
796
- ):
797
- """Multi-headed, multi-layer Transformer from "Attention is All You Need".
798
-
799
- This is almost an exact implementation of the original Transformer encoder.
800
-
801
- See the original paper:
802
- https://arxiv.org/abs/1706.03762
803
-
804
- Also see:
805
- https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
806
-
807
- Args:
808
- input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
809
- attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
810
- seq_length], with 1 for positions that can be attended to and 0 in
811
- positions that should not be.
812
- hidden_size: int. Hidden size of the Transformer.
813
- num_hidden_layers: int. Number of layers (blocks) in the Transformer.
814
- num_attention_heads: int. Number of attention heads in the Transformer.
815
- intermediate_size: int. The size of the "intermediate" (a.k.a., feed
816
- forward) layer.
817
- intermediate_act_fn: function. The non-linear activation function to apply
818
- to the output of the intermediate/feed-forward layer.
819
- hidden_dropout_prob: float. Dropout probability for the hidden layers.
820
- attention_probs_dropout_prob: float. Dropout probability of the attention
821
- probabilities.
822
- initializer_range: float. Range of the initializer (stddev of truncated
823
- normal).
824
- do_return_all_layers: Whether to also return all layers or just the final
825
- layer.
826
-
827
- Returns:
828
- float Tensor of shape [batch_size, seq_length, hidden_size], the final
829
- hidden layer of the Transformer.
830
-
831
- Raises:
832
- ValueError: A Tensor shape or parameter is invalid.
833
- """
834
- if hidden_size % num_attention_heads != 0:
835
- raise ValueError(
836
- "The hidden size (%d) is not a multiple of the number of attention "
837
- "heads (%d)" % (hidden_size, num_attention_heads)
838
- )
839
-
840
- attention_head_size = int(hidden_size / num_attention_heads)
841
- input_shape = get_shape_list(input_tensor, expected_rank=3)
842
- batch_size = input_shape[0]
843
- seq_length = input_shape[1]
844
- input_width = input_shape[2]
845
-
846
- # The Transformer performs sum residuals on all layers so the input needs
847
- # to be the same as the hidden size.
848
- if input_width != hidden_size:
849
- raise ValueError(
850
- "The width of the input tensor (%d) != hidden size (%d)"
851
- % (input_width, hidden_size)
852
- )
853
-
854
- # We keep the representation as a 2D tensor to avoid re-shaping it back and
855
- # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
856
- # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
857
- # help the optimizer.
858
- prev_output = reshape_to_matrix(input_tensor)
859
-
860
- all_layer_outputs = []
861
- for layer_idx in range(num_hidden_layers):
862
- with tf.variable_scope("layer_%d" % layer_idx):
863
- layer_input = prev_output
864
-
865
- with tf.variable_scope("attention"):
866
- attention_heads = []
867
- with tf.variable_scope("self"):
868
- attention_head = attention_layer(
869
- from_tensor=layer_input,
870
- to_tensor=layer_input,
871
- attention_mask=attention_mask,
872
- num_attention_heads=num_attention_heads,
873
- size_per_head=attention_head_size,
874
- attention_probs_dropout_prob=attention_probs_dropout_prob,
875
- initializer_range=initializer_range,
876
- do_return_2d_tensor=True,
877
- batch_size=batch_size,
878
- from_seq_length=seq_length,
879
- to_seq_length=seq_length,
880
- )
881
- attention_heads.append(attention_head)
882
-
883
- attention_output = None
884
- if len(attention_heads) == 1:
885
- attention_output = attention_heads[0]
886
- else:
887
- # In the case where we have other sequences, we just concatenate
888
- # them to the self-attention head before the projection.
889
- attention_output = tf.concat(attention_heads, axis=-1)
890
-
891
- # Run a linear projection of `hidden_size` then add a residual
892
- # with `layer_input`.
893
- with tf.variable_scope("output"):
894
- attention_output = tf.layers.dense(
895
- attention_output,
896
- hidden_size,
897
- kernel_initializer=create_initializer(initializer_range),
898
- )
899
- attention_output = dropout(attention_output, hidden_dropout_prob)
900
- attention_output = layer_norm(attention_output + layer_input)
901
-
902
- # The activation is only applied to the "intermediate" hidden layer.
903
- with tf.variable_scope("intermediate"):
904
- intermediate_output = tf.layers.dense(
905
- attention_output,
906
- intermediate_size,
907
- activation=intermediate_act_fn,
908
- kernel_initializer=create_initializer(initializer_range),
909
- )
910
-
911
- # Down-project back to `hidden_size` then add the residual.
912
- with tf.variable_scope("output"):
913
- layer_output = tf.layers.dense(
914
- intermediate_output,
915
- hidden_size,
916
- kernel_initializer=create_initializer(initializer_range),
917
- )
918
- layer_output = dropout(layer_output, hidden_dropout_prob)
919
- layer_output = layer_norm(layer_output + attention_output)
920
- prev_output = layer_output
921
- all_layer_outputs.append(layer_output)
922
-
923
- if do_return_all_layers:
924
- final_outputs = []
925
- for layer_output in all_layer_outputs:
926
- final_output = reshape_from_matrix(layer_output, input_shape)
927
- final_outputs.append(final_output)
928
- return final_outputs
929
- else:
930
- final_output = reshape_from_matrix(prev_output, input_shape)
931
- return final_output
932
-
933
-
934
- def get_shape_list(tensor, expected_rank=None, name=None):
935
- """Returns a list of the shape of tensor, preferring static dimensions.
936
-
937
- Args:
938
- tensor: A tf.Tensor object to find the shape of.
939
- expected_rank: (optional) int. The expected rank of `tensor`. If this is
940
- specified and the `tensor` has a different rank, an exception will be
941
- thrown.
942
- name: Optional name of the tensor for the error message.
943
-
944
- Returns:
945
- A list of dimensions of the shape of tensor. All static dimensions will
946
- be returned as python integers, and dynamic dimensions will be returned
947
- as tf.Tensor scalars.
948
- """
949
- if name is None:
950
- name = tensor.name
951
-
952
- if expected_rank is not None:
953
- assert_rank(tensor, expected_rank, name)
954
-
955
- shape = tensor.shape.as_list()
956
-
957
- non_static_indexes = []
958
- for (index, dim) in enumerate(shape):
959
- if dim is None:
960
- non_static_indexes.append(index)
961
-
962
- if not non_static_indexes:
963
- return shape
964
-
965
- dyn_shape = tf.shape(tensor)
966
- for index in non_static_indexes:
967
- shape[index] = dyn_shape[index]
968
- return shape
969
-
970
-
971
- def reshape_to_matrix(input_tensor):
972
- """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
973
- ndims = input_tensor.shape.ndims
974
- if ndims < 2:
975
- raise ValueError(
976
- "Input tensor must have at least rank 2. Shape = %s" % (input_tensor.shape)
977
- )
978
- if ndims == 2:
979
- return input_tensor
980
-
981
- width = input_tensor.shape[-1]
982
- output_tensor = tf.reshape(input_tensor, [-1, width])
983
- return output_tensor
984
-
985
-
986
- def reshape_from_matrix(output_tensor, orig_shape_list):
987
- """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
988
- if len(orig_shape_list) == 2:
989
- return output_tensor
990
-
991
- output_shape = get_shape_list(output_tensor)
992
-
993
- orig_dims = orig_shape_list[0:-1]
994
- width = output_shape[-1]
995
-
996
- return tf.reshape(output_tensor, orig_dims + [width])
997
-
998
-
999
- def assert_rank(tensor, expected_rank, name=None):
1000
- """Raises an exception if the tensor rank is not of the expected rank.
1001
-
1002
- Args:
1003
- tensor: A tf.Tensor to check the rank of.
1004
- expected_rank: Python integer or list of integers, expected rank.
1005
- name: Optional name of the tensor for the error message.
1006
-
1007
- Raises:
1008
- ValueError: If the expected shape doesn't match the actual shape.
1009
- """
1010
- if name is None:
1011
- name = tensor.name
1012
-
1013
- expected_rank_dict = {}
1014
- if isinstance(expected_rank, six.integer_types):
1015
- expected_rank_dict[expected_rank] = True
1016
- else:
1017
- for x in expected_rank:
1018
- expected_rank_dict[x] = True
1019
-
1020
- actual_rank = tensor.shape.ndims
1021
- if actual_rank not in expected_rank_dict:
1022
- scope_name = tf.get_variable_scope().name
1023
- raise ValueError(
1024
- "For the tensor `%s` in scope `%s`, the actual rank "
1025
- "`%d` (shape = %s) is not equal to the expected rank `%s`"
1026
- % (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))
1027
- )
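For readers skimming the removed file, two of its building blocks are easy to sanity-check outside of TF1. The sketch below re-states in plain NumPy (a) the tanh approximation of GELU used by the deleted `gelu()` and (b) the additive -10000.0 attention-mask trick from `attention_layer()`. It is an illustrative re-derivation, not code shipped in either wheel.

```python
import numpy as np

def gelu(x):
    # Tanh approximation of the Gaussian Error Linear Unit, same formula as the
    # deleted modeling.py: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
    cdf = 0.5 * (1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
    return x * cdf

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# Additive attention mask: masked positions (mask == 0) get a large negative bias
# before the softmax, so their attention probability collapses to ~0.
scores = np.array([[2.0, 1.0, 0.5]])   # raw query-key scores, one query over three keys
mask = np.array([[1.0, 1.0, 0.0]])     # the last key position is padding
adder = (1.0 - mask) * -10000.0
probs = softmax(scores + adder)

print(gelu(np.array([-1.0, 0.0, 1.0])))  # approx. [-0.159, 0.0, 0.841]
print(probs)                             # approx. [[0.731, 0.269, 0.0]]
```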