tf-models-nightly 2.17.0.dev20240610__py2.py3-none-any.whl → 2.17.0.dev20240612__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- official/nlp/modeling/layers/__init__.py +2 -0
- official/nlp/modeling/layers/pack_optimization.py +8 -0
- official/nlp/modeling/layers/rezero_transformer.py +69 -20
- official/nlp/modeling/layers/rezero_transformer_test.py +63 -0
- official/nlp/modeling/layers/transformer_encoder_block.py +47 -7
- official/nlp/modeling/layers/transformer_encoder_block_test.py +78 -0
- {tf_models_nightly-2.17.0.dev20240610.dist-info → tf_models_nightly-2.17.0.dev20240612.dist-info}/METADATA +1 -1
- {tf_models_nightly-2.17.0.dev20240610.dist-info → tf_models_nightly-2.17.0.dev20240612.dist-info}/RECORD +12 -12
- {tf_models_nightly-2.17.0.dev20240610.dist-info → tf_models_nightly-2.17.0.dev20240612.dist-info}/AUTHORS +0 -0
- {tf_models_nightly-2.17.0.dev20240610.dist-info → tf_models_nightly-2.17.0.dev20240612.dist-info}/LICENSE +0 -0
- {tf_models_nightly-2.17.0.dev20240610.dist-info → tf_models_nightly-2.17.0.dev20240612.dist-info}/WHEEL +0 -0
- {tf_models_nightly-2.17.0.dev20240610.dist-info → tf_models_nightly-2.17.0.dev20240612.dist-info}/top_level.txt +0 -0
official/nlp/modeling/layers/__init__.py

@@ -23,6 +23,7 @@ from official.nlp.modeling.layers.attention import *
 from official.nlp.modeling.layers.bigbird_attention import BigBirdAttention
 from official.nlp.modeling.layers.bigbird_attention import BigBirdMasks
 from official.nlp.modeling.layers.block_diag_feedforward import BlockDiagFeedforward
+from official.nlp.modeling.layers.block_sparse_attention import MultiHeadAttention as BlockSparseAttention
 from official.nlp.modeling.layers.cls_head import *
 from official.nlp.modeling.layers.factorized_embedding import FactorizedEmbedding
 from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
@@ -44,6 +45,7 @@ from official.nlp.modeling.layers.moe import FeedForwardExperts
 from official.nlp.modeling.layers.moe import MoeLayer
 from official.nlp.modeling.layers.moe import MoeLayerWithBackbone
 from official.nlp.modeling.layers.multi_channel_attention import *
+from official.nlp.modeling.layers.multi_query_attention import MultiHeadAttention as MultiQueryAttention
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.modeling.layers.pack_optimization import PackBertEmbeddings
 from official.nlp.modeling.layers.pack_optimization import StridedReZeroTransformer
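
With these re-exports, the two specialized attention layers become importable directly from official.nlp.modeling.layers. A minimal usage sketch, assuming only the constructor arguments exercised elsewhere in this diff (anything beyond that is an assumption):

from official.nlp.modeling import layers

# Block sparse attention: attention is restricted to fixed-size source/target blocks.
block_sparse = layers.BlockSparseAttention(
    num_heads=8, key_dim=64, src_block_size=4, tgt_block_size=4)

# Multi-query / grouped-query attention: fewer key-value heads than query heads.
multi_query = layers.MultiQueryAttention(
    num_heads=8, key_dim=64, num_kv_heads=1)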

official/nlp/modeling/layers/pack_optimization.py

@@ -72,6 +72,10 @@ class StridedTransformerEncoderBlock(
     if self._output_range is not None:
       raise ValueError('StridedTransformerEncoderBlock does not '
                        'support `output_range` argument.')
+    # TODO(b/337888023): Support block sparse attention with strided inputs.
+    if self._src_block_size is not None:
+      raise ValueError('StridedTransformerEncoderBlock does not '
+                       'support block sparse attention.')
 
   def call(self, inputs, stride: tf.Tensor):
     if isinstance(inputs, (list, tuple)):
@@ -137,6 +141,10 @@ class StridedReZeroTransformer(rezero_transformer.ReZeroTransformer):
     if self._output_range is not None:
       raise ValueError(f'{self.__class__} does not '
                        'support `output_range` argument.')
+    # TODO(b/337888023): Support block sparse attention with strided inputs.
+    if self._src_block_size is not None:
+      raise ValueError(f'{self.__class__} does not '
+                       'support block sparse attention.')
 
   def call(self, inputs, stride: tf.Tensor):
     if isinstance(inputs, (list, tuple)):
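
Both strided packing variants explicitly reject the new block sparse path for now (see the TODO referencing b/337888023). A hedged sketch of what the guard implies for callers; the constructor arguments mirror TransformerEncoderBlock, and whether the check fires at construction or at build time is not visible in this excerpt:

from official.nlp.modeling.layers import pack_optimization

strided = pack_optimization.StridedTransformerEncoderBlock(
    num_attention_heads=8,
    inner_dim=2048,
    inner_activation='relu',
    src_block_size=4,
    tgt_block_size=4,
)
# Expected outcome once the guard runs:
# ValueError: StridedTransformerEncoderBlock does not support block sparse attention.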

official/nlp/modeling/layers/rezero_transformer.py

@@ -21,6 +21,8 @@ import gin
 import tensorflow as tf, tf_keras
 
 from official.modeling import tf_utils
+from official.nlp.modeling.layers import block_sparse_attention
+from official.nlp.modeling.layers import multi_query_attention
 from official.nlp.modeling.layers import util
 
 
@@ -53,6 +55,12 @@ class ReZeroTransformer(tf_keras.layers.Layer):
     bias_constraint: Constraint for dense layer kernels.
     use_layer_norm: If add layer_norm on top of the ReZero.
     share_rezero: If attention layer and FFN layer share the same alpha.
+    num_kv_heads: Number of key-value heads for multi-query attention. Refer to
+      `multi_query_attention.MultiHeadAttention` for more details.
+    src_block_size: Source block size. Refer to
+      `block_sparse_attention.MultiHeadAttention` for more details.
+    tgt_block_size: Target block size. Refer to
+      `block_sparse_attention.MultiHeadAttention` for more details.
   """
 
   def __init__(self,
@@ -71,6 +79,9 @@ class ReZeroTransformer(tf_keras.layers.Layer):
                bias_constraint=None,
                use_layer_norm=False,
                share_rezero=True,
+               num_kv_heads=None,
+               src_block_size=None,
+               tgt_block_size=None,
                **kwargs):
     # attention_dropout will override attention_dropout_rate.
     # This is to unify the input params with TransformerEncoderBlock.
@@ -101,6 +112,14 @@ class ReZeroTransformer(tf_keras.layers.Layer):
     self._bias_constraint = tf_keras.constraints.get(bias_constraint)
     self._use_layer_norm = use_layer_norm
     self._share_rezero = share_rezero
+    self._num_kv_heads = num_kv_heads
+    self._src_block_size = src_block_size
+    self._tgt_block_size = tgt_block_size
+    if self._num_kv_heads is not None and self._src_block_size is not None:
+      raise ValueError(
+          "Block sparse attention does not support Multi-query attention."
+          " Specify only one of them."
+      )
 
   def build(self, input_shape):
     if isinstance(input_shape, tf.TensorShape):
@@ -109,53 +128,77 @@ class ReZeroTransformer(tf_keras.layers.Layer):
       input_tensor_shape = tf.TensorShape(input_shape[0])
     else:
       raise ValueError(
-          "The type of input shape argument is not supported, got: %s"
-          type(input_shape)
+          "The type of input shape argument is not supported, got: %s"
+          % type(input_shape)
+      )
 
     if len(input_tensor_shape.as_list()) != 3:
-      raise ValueError(
-
+      raise ValueError(
+          "TransformerLayer expects a three-dimensional input of "
+          "shape [batch, sequence, width]."
+      )
     batch_size, sequence_length, hidden_size = input_tensor_shape
 
     if len(input_shape) == 2:
       mask_tensor_shape = tf.TensorShape(input_shape[1])
       expected_mask_tensor_shape = tf.TensorShape(
-          [batch_size, sequence_length, sequence_length]
+          [batch_size, sequence_length, sequence_length]
+      )
       if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
-        raise ValueError(
-
-
-
-
+        raise ValueError(
+            "When passing a mask tensor to TransformerLayer, the "
+            "mask tensor must be of shape [batch, "
+            "sequence_length, sequence_length] (here %s). Got a "
+            "mask tensor of shape %s."
+            % (expected_mask_tensor_shape, mask_tensor_shape)
+        )
     if hidden_size % self._num_heads != 0:
       raise ValueError(
           "The input size (%d) is not a multiple of the number of attention "
-          "heads (%d)" % (hidden_size, self._num_heads)
+          "heads (%d)" % (hidden_size, self._num_heads)
+      )
     self._attention_head_size = int(hidden_size // self._num_heads)
     common_kwargs = dict(
         kernel_regularizer=self._kernel_regularizer,
         bias_regularizer=self._bias_regularizer,
         activity_regularizer=self._activity_regularizer,
         kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint
-
+        bias_constraint=self._bias_constraint,
+    )
+    attention_kwargs = dict(
         num_heads=self._num_heads,
         key_dim=self._attention_head_size,
         dropout=self._attention_dropout_rate,
         name="self_attention",
         kernel_initializer=tf_utils.clone_initializer(self._kernel_initializer),
         bias_initializer=tf_utils.clone_initializer(self._bias_initializer),
-
+    )
+    if self._src_block_size is not None:
+      attention_kwargs.update(
+          src_block_size=self._src_block_size,
+          tgt_block_size=self._tgt_block_size,
+          name="block_sparse_attention",
+      )
+      attention_fn = block_sparse_attention.MultiHeadAttention
+    elif self._num_kv_heads is not None:
+      attention_kwargs.update(
+          num_kv_heads=self._num_kv_heads,
+          name="multi_query_attention",
+      )
+      attention_fn = multi_query_attention.MultiHeadAttention
+    else:
+      attention_fn = tf_keras.layers.MultiHeadAttention
+    self._attention_layer = attention_fn(**attention_kwargs, **common_kwargs)
     self._attention_dropout = tf_keras.layers.Dropout(rate=self._dropout_rate)
     if self._use_layer_norm:
       # Use float32 in layernorm for numeric stability.
       # It is probably safe in mixed_float16, but we haven't validated this yet.
-      self._attention_layer_norm = (
-
-
-
-
-
+      self._attention_layer_norm = tf_keras.layers.LayerNormalization(
+          name="self_attention_layer_norm",
+          axis=-1,
+          epsilon=1e-12,
+          dtype=tf.float32,
+      )
     self._intermediate_dense = tf_keras.layers.EinsumDense(
         "abc,cd->abd",
         output_shape=(None, self._inner_dim),
@@ -221,6 +264,12 @@ class ReZeroTransformer(tf_keras.layers.Layer):
             self._use_layer_norm,
         "share_rezero":
            self._share_rezero,
+        "num_kv_heads":
+            self._num_kv_heads,
+        "src_block_size":
+            self._src_block_size,
+        "tgt_block_size":
+            self._tgt_block_size,
         "kernel_initializer":
            tf_keras.initializers.serialize(self._kernel_initializer),
         "bias_initializer":
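
Taken together, the rezero_transformer.py changes let ReZeroTransformer pick its attention implementation from the new constructor arguments: block_sparse_attention.MultiHeadAttention when src_block_size is set, multi_query_attention.MultiHeadAttention when num_kv_heads is set, and tf_keras.layers.MultiHeadAttention otherwise; setting both raises a ValueError, and all three values round-trip through get_config. A usage sketch modeled on the new tests further down in this diff (the hyperparameter values are illustrative, not required):

from official.nlp.modeling.layers import rezero_transformer

# Grouped-query attention inside a ReZero block; the new tests use kv-head
# counts that evenly divide num_attention_heads.
gqa_layer = rezero_transformer.ReZeroTransformer(
    num_attention_heads=10,
    intermediate_size=2048,
    intermediate_activation='relu',
    num_kv_heads=5,
)

# Block sparse attention variant; the new tests pair sequence length 21 with
# 3x3 blocks so the sequence splits evenly into blocks.
sparse_layer = rezero_transformer.ReZeroTransformer(
    num_attention_heads=10,
    intermediate_size=2048,
    intermediate_activation='relu',
    src_block_size=3,
    tgt_block_size=3,
)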

official/nlp/modeling/layers/rezero_transformer_test.py

@@ -141,6 +141,69 @@ class TransformerWithReZeroLayerTest(tf.test.TestCase, parameterized.TestCase):
     output = test_layer(inputs)
     self.assertEqual(output.shape, q_tensor.shape)
 
+  @parameterized.named_parameters(('_mqa', 1),
+                                  ('_gqa', 5))
+  def test_rezero_with_kv_heads(self, num_kv_heads):
+    tf_keras.mixed_precision.set_global_policy('mixed_float16')
+    test_layer = rezero_transformer.ReZeroTransformer(
+        num_attention_heads=10,
+        intermediate_size=2048,
+        intermediate_activation='relu',
+        num_kv_heads=num_kv_heads,
+    )
+    sequence_length = 21
+    width = 80
+    # Create a 3-dimensional input (the first dimension is implicit).
+    data_tensor = tf_keras.Input(shape=(sequence_length, width))
+    # Create a 2-dimensional input (the first dimension is implicit).
+    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
+    output_tensor = test_layer([data_tensor, mask_tensor])
+
+    # Create a model from the test layer.
+    model = tf_keras.Model([data_tensor, mask_tensor], output_tensor)
+
+    # Invoke the model on test data. We can't validate the output data itself
+    # (the NN is too complex) but this will rule out structural runtime errors.
+    batch_size = 6
+    input_data = (10 * np.random.random_sample(
+        (batch_size, sequence_length, width)))
+    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
+    # which here is (batch, sequence_length, sequence_length)
+    mask_data = np.random.randint(
+        2, size=(batch_size, sequence_length, sequence_length))
+    _ = model.predict([input_data, mask_data])
+
+  def test_rezero_with_block_sparse_attention(self):
+    tf_keras.mixed_precision.set_global_policy('mixed_float16')
+    test_layer = rezero_transformer.ReZeroTransformer(
+        num_attention_heads=10,
+        intermediate_size=2048,
+        intermediate_activation='relu',
+        src_block_size=3,
+        tgt_block_size=3,
+    )
+    sequence_length = 21
+    width = 80
+    # Create a 3-dimensional input (the first dimension is implicit).
+    data_tensor = tf_keras.Input(shape=(sequence_length, width))
+    # Create a 2-dimensional input (the first dimension is implicit).
+    mask_tensor = tf_keras.Input(shape=(sequence_length, sequence_length))
+    output_tensor = test_layer([data_tensor, mask_tensor])
+
+    # Create a model from the test layer.
+    model = tf_keras.Model([data_tensor, mask_tensor], output_tensor)
+
+    # Invoke the model on test data. We can't validate the output data itself
+    # (the NN is too complex) but this will rule out structural runtime errors.
+    batch_size = 6
+    input_data = (10 * np.random.random_sample(
+        (batch_size, sequence_length, width)))
+    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
+    # which here is (batch, sequence_length, sequence_length)
+    mask_data = np.random.randint(
+        2, size=(batch_size, sequence_length, sequence_length))
+    _ = model.predict([input_data, mask_data])
+
 
 if __name__ == '__main__':
   tf.test.main()

official/nlp/modeling/layers/transformer_encoder_block.py

@@ -18,6 +18,8 @@ from absl import logging
 import tensorflow as tf, tf_keras
 
 from official.modeling import tf_utils
+from official.nlp.modeling.layers import block_sparse_attention
+from official.nlp.modeling.layers import multi_query_attention
 from official.nlp.modeling.layers import util
 
 
@@ -107,6 +109,9 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
                output_last_dim=None,
                diff_q_kv_att_layer_norm=False,
                return_attention_scores=False,
+               num_kv_heads=None,
+               src_block_size=None,
+               tgt_block_size=None,
                **kwargs):
     """Initializes `TransformerEncoderBlock`.
 
@@ -174,6 +179,12 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
       return_attention_scores: If `True`, the output of this layer will be a
        tuple and additionally contain the attention scores in the shape of
        `[batch_size, num_attention_heads, seq_dim, seq_dim]`.
+      num_kv_heads: Number of key-value heads for multi-query attention. Refer
+        to `multi_query_attention.MultiHeadAttention` for more details.
+      src_block_size: Source block size. Refer to
+        `block_sparse_attention.MultiHeadAttention` for more details.
+      tgt_block_size: Target block size. Refer to
+        `block_sparse_attention.MultiHeadAttention` for more details.
       **kwargs: keyword arguments.
     """
     util.filter_kwargs(kwargs)
@@ -208,6 +219,14 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
     self._output_last_dim = output_last_dim
     self._diff_q_kv_att_layer_norm = diff_q_kv_att_layer_norm
     self._return_attention_scores = return_attention_scores
+    self._num_kv_heads = num_kv_heads
+    self._src_block_size = src_block_size
+    self._tgt_block_size = tgt_block_size
+    if self._num_kv_heads is not None and self._src_block_size is not None:
+      raise ValueError(
+          "Block sparse attention does not support Multi-query attention."
+          " Specify only one of them."
+      )
     if attention_initializer:
       self._attention_initializer = tf_keras.initializers.get(
           attention_initializer)
@@ -244,12 +263,7 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
     else:
       last_output_shape = self._output_last_dim
 
-
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint)
-    self._attention_layer = tf_keras.layers.MultiHeadAttention(
+    attention_layer_kwargs = dict(
         num_heads=self._num_heads,
         key_dim=self._key_dim,
         value_dim=self._value_dim,
@@ -260,7 +274,30 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
         attention_axes=self._attention_axes,
         output_shape=self._output_last_dim,
         name="self_attention",
-
+    )
+    common_kwargs = dict(
+        bias_regularizer=self._bias_regularizer,
+        activity_regularizer=self._activity_regularizer,
+        kernel_constraint=self._kernel_constraint,
+        bias_constraint=self._bias_constraint,
+    )
+    if self._src_block_size is not None:
+      attention_layer_kwargs.update(
+          src_block_size=self._src_block_size,
+          tgt_block_size=self._tgt_block_size,
+          name="block_sparse_attention",
+      )
+      attention_fn = block_sparse_attention.MultiHeadAttention
+    elif self._num_kv_heads is not None:
+      attention_layer_kwargs.update(
+          num_kv_heads=self._num_kv_heads,
+          name="multi_query_attention",
+      )
+      attention_fn = multi_query_attention.MultiHeadAttention
+    else:
+      attention_fn = tf_keras.layers.MultiHeadAttention
+    self._attention_layer = attention_fn(
+        **attention_layer_kwargs, **common_kwargs
     )
     self._attention_dropout = tf_keras.layers.Dropout(
         rate=self._attention_dropout_rate
@@ -373,6 +410,9 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
         "value_dim": self._value_dim,
         "output_last_dim": self._output_last_dim,
         "diff_q_kv_att_layer_norm": self._diff_q_kv_att_layer_norm,
+        "num_kv_heads": self._num_kv_heads,
+        "src_block_size": self._src_block_size,
+        "tgt_block_size": self._tgt_block_size,
     }
     base_config = super().get_config()
     return dict(list(base_config.items()) + list(config.items()))
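
TransformerEncoderBlock gains the same three arguments, the same mutual-exclusion check, and the same attention_fn selection, and it now serializes num_kv_heads, src_block_size and tgt_block_size through get_config. A sketch mirroring the new tests below; the shape comment restates their attention-scores assertion rather than documenting a general API guarantee:

import tf_keras
from official.nlp.modeling.layers.transformer_encoder_block import TransformerEncoderBlock

block = TransformerEncoderBlock(
    num_attention_heads=8,
    inner_dim=2048,
    inner_activation='relu',
    return_attention_scores=True,
    src_block_size=7,
    tgt_block_size=7,
)
inputs = tf_keras.Input(shape=(21, 80))
outputs, scores = block(inputs)
# Per the new test, block sparse attention scores come back as
# [batch, num_heads, seq_len // src_block_size, src_block_size, tgt_block_size].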

official/nlp/modeling/layers/transformer_encoder_block_test.py

@@ -712,6 +712,84 @@ class TransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
     self.assertEqual(output_tensor.shape.as_list(),
                      expected_layer_output_shape)
 
+  @parameterized.named_parameters(
+      ('mqa', 1),
+      ('gqa', 4),
+  )
+  def test_attention_with_kv_heads(self, num_kv_heads):
+    num_attention_heads = 8
+    sequence_length = 21
+    width = 80
+
+    test_layer = TransformerEncoderBlock(
+        num_attention_heads=num_attention_heads,
+        inner_dim=2048,
+        inner_activation='relu',
+        return_attention_scores=True,
+        num_kv_heads=num_kv_heads,
+    )
+    # Create a 3-dimensional input (the first dimension is implicit).
+    data_tensor = tf_keras.Input(shape=(sequence_length, width))
+    output_tensor = test_layer(data_tensor)
+
+    expected_layer_output_shape = [None, sequence_length, width]
+    expected_attention_scores_shape = [
+        None,
+        num_attention_heads,
+        sequence_length,
+        sequence_length,
+    ]
+
+    self.assertIsInstance(output_tensor, tuple)
+    self.assertLen(output_tensor, 2)
+    # First is the standard output.
+    self.assertEqual(
+        output_tensor[0].shape.as_list(), expected_layer_output_shape
+    )
+    # Second is the attention scores.
+    self.assertEqual(
+        output_tensor[1].shape.as_list(), expected_attention_scores_shape
+    )
+
+  def test_block_sparse_attention(self):
+    num_attention_heads = 8
+    sequence_length = 21
+    width = 80
+    src_block_size = 7
+    tgt_block_size = 7
+
+    test_layer = TransformerEncoderBlock(
+        num_attention_heads=num_attention_heads,
+        inner_dim=2048,
+        inner_activation='relu',
+        return_attention_scores=True,
+        src_block_size=src_block_size,
+        tgt_block_size=tgt_block_size,
+    )
+    # Create a 3-dimensional input (the first dimension is implicit).
+    data_tensor = tf_keras.Input(shape=(sequence_length, width))
+    output_tensor = test_layer(data_tensor)
+
+    expected_layer_output_shape = [None, sequence_length, width]
+    expected_attention_scores_shape = [
+        None,
+        num_attention_heads,
+        sequence_length // src_block_size,
+        src_block_size,
+        tgt_block_size,
+    ]
+
+    self.assertIsInstance(output_tensor, tuple)
+    self.assertLen(output_tensor, 2)
+    # First is the standard output.
+    self.assertEqual(
+        output_tensor[0].shape.as_list(), expected_layer_output_shape
+    )
+    # Second is the attention scores.
+    self.assertEqual(
+        output_tensor[1].shape.as_list(), expected_attention_scores_shape
+    )
+
 
 if __name__ == '__main__':
   tf.test.main()

{tf_models_nightly-2.17.0.dev20240610.dist-info → tf_models_nightly-2.17.0.dev20240612.dist-info}/RECORD

@@ -298,7 +298,7 @@ official/nlp/metrics/__init__.py,sha256=7oiypy0N82PDw9aSdcJBLVoGTd_oRSUOdvuJhMv4
 official/nlp/metrics/bleu.py,sha256=XOTTbjC3B9lt8-MLvNX02tjA94wfsUVse6KJ5CWPzfk,6587
 official/nlp/metrics/bleu_test.py,sha256=0j4pZ1MSIcndvUNZa25oXCu4UFOE367KaL7oRNCzLCI,2508
 official/nlp/modeling/__init__.py,sha256=SQozaRl78tYS6xvGCfM3msABe2VL20x_mL2vIln1Sn0,1062
-official/nlp/modeling/layers/__init__.py,sha256=
+official/nlp/modeling/layers/__init__.py,sha256=vsVNp7WcO4o500l7Zq_-_BIqYbK4fKRMEEtCRJCSP2E,5076
 official/nlp/modeling/layers/attention.py,sha256=3-jG3m_L9Y41BY35c4uTFG_Ywlfk4SOwUEtmqfSoKkk,3906
 official/nlp/modeling/layers/attention_test.py,sha256=c7KezuYUze8PWAPuwYow8KTQNRyuuJgwICSsFTyJ2nQ,3536
 official/nlp/modeling/layers/bigbird_attention.py,sha256=dzutgRoQt2DFsYMpMILv_QF0O_FMDbiLQ3T-7c1Zpcs,21111
@@ -335,7 +335,7 @@ official/nlp/modeling/layers/multi_query_attention.py,sha256=fFPBa9IBVj_O5x5OfGu
 official/nlp/modeling/layers/multi_query_attention_test.py,sha256=3VFF2hz85YExWPwdbhYWaSrIaSOkC1x7axdGfXr0W90,8512
 official/nlp/modeling/layers/on_device_embedding.py,sha256=FgsHyRXf5TWVTyo4OeKImmrTnn4uOPJgS3AGKzKMWYY,4582
 official/nlp/modeling/layers/on_device_embedding_test.py,sha256=M-LUba4QXV37s9Cx7aH8LL3bz_YotC6qITmWRI7Fhjk,8589
-official/nlp/modeling/layers/pack_optimization.py,sha256=
+official/nlp/modeling/layers/pack_optimization.py,sha256=7bQS9k5Pd9X08KyI6-Px1t8tWYG719Bgaxqqs6IXSDo,10760
 official/nlp/modeling/layers/pack_optimization_test.py,sha256=dpsyZAI_PNq9C5HkOkCk70hWaSbT0UThSclwQeYDQqU,2795
 official/nlp/modeling/layers/per_dim_scale_attention.py,sha256=1xECNMAB91lz7eVl6FevwRrHXaHW3-FCpjXTO8F3S4M,3416
 official/nlp/modeling/layers/per_dim_scale_attention_test.py,sha256=_JbPV0ALqFSCWYBvmuemeN4ist0AnNPbQLgwVsRvavU,1761
@@ -347,8 +347,8 @@ official/nlp/modeling/layers/reuse_attention.py,sha256=qvAC-Dr2uPbpQWOvaf0RVN7t6
 official/nlp/modeling/layers/reuse_attention_test.py,sha256=rKr-dl05DqQesYdvYaCYYahIp0ObP4Xgi4Lno4jsl3Y,14329
 official/nlp/modeling/layers/reuse_transformer.py,sha256=S0IxI8LzjAnZ5L3MDy32oanI3oGQhxQjdkC3ff-zlmc,15697
 official/nlp/modeling/layers/reuse_transformer_test.py,sha256=GXuJWfNrqsOwxAi0xSyuziD3kreVWGPCr0LHmxxe0Mk,17201
-official/nlp/modeling/layers/rezero_transformer.py,sha256=
-official/nlp/modeling/layers/rezero_transformer_test.py,sha256
+official/nlp/modeling/layers/rezero_transformer.py,sha256=SE6iDIlguTxEBdKR79XGOZaqHJ79UY5VZuolgQXQz8g,14249
+official/nlp/modeling/layers/rezero_transformer_test.py,sha256=-Ib-PpZvWgP5aiH3EhyY69OxXLldxCMsnT8oAhojXzI,8620
 official/nlp/modeling/layers/routing.py,sha256=hV6RHVBU0lEgwx180Q78mDncuXTcyWPyaVBqatWCtQw,4469
 official/nlp/modeling/layers/routing_test.py,sha256=ViRCnFWPdwM4Kam0k8aDZbyoJqeqyIiQKEmlhNE7LgI,2226
 official/nlp/modeling/layers/self_attention_mask.py,sha256=7avqkfChwnuZU-qqAED0x1gwwmWSMUszZVAIch8NF_Y,2173
@@ -363,8 +363,8 @@ official/nlp/modeling/layers/tn_expand_condense_test.py,sha256=J52mXzoiuaXfR61kh
 official/nlp/modeling/layers/tn_transformer_expand_condense.py,sha256=gbGJOrgxJd1SyMGB6ME04FSxuZfHqsi94Xxt23l7368,11032
 official/nlp/modeling/layers/tn_transformer_test.py,sha256=Fh-EDRoAkhO7ccD3w3FsJHC51MnZySv8jBlHYnvKZMc,8893
 official/nlp/modeling/layers/transformer.py,sha256=yofIEOjZpcvDmHbcjBmkZrl5iSe6pLtMsetNbXmxDnY,20087
-official/nlp/modeling/layers/transformer_encoder_block.py,sha256=
-official/nlp/modeling/layers/transformer_encoder_block_test.py,sha256=
+official/nlp/modeling/layers/transformer_encoder_block.py,sha256=9EuAsedY35eIFc4z-22QQ4c47NHrEe8-8uzjtPfgNTM,21977
+official/nlp/modeling/layers/transformer_encoder_block_test.py,sha256=chs8-M69Gx_Zcp7Pi7sNKjpWgyuSHDw_fNrRh6URPLc,30686
 official/nlp/modeling/layers/transformer_scaffold.py,sha256=m8TF4geBkm8-VJQiTpzMI6FSJZry6oa2vPO3FXCCClE,15704
 official/nlp/modeling/layers/transformer_scaffold_test.py,sha256=pqUGldhmAKROrd4eoCWmHNtKOdCO6PH_-EigcYnvIpE,19920
 official/nlp/modeling/layers/transformer_test.py,sha256=kC_9NcLbJnBbuTaE_7BW60EF8xG_QUoICj0t0gS7O4Q,5522
@@ -1212,9 +1212,9 @@ tensorflow_models/tensorflow_models_test.py,sha256=nc6A9K53OGqF25xN5St8EiWvdVbda
 tensorflow_models/nlp/__init__.py,sha256=4tA5Pf4qaFwT-fIFOpX7x7FHJpnyJT-5UgOeFYTyMlc,807
 tensorflow_models/uplift/__init__.py,sha256=mqfa55gweOdpKoaQyid4A_4u7xw__FcQeSIF0k_pYmI,999
 tensorflow_models/vision/__init__.py,sha256=zBorY_v5xva1uI-qxhZO3Qh-Dii-Suq6wEYh6hKHDfc,833
-tf_models_nightly-2.17.0.
-tf_models_nightly-2.17.0.
-tf_models_nightly-2.17.0.
-tf_models_nightly-2.17.0.
-tf_models_nightly-2.17.0.
-tf_models_nightly-2.17.0.
+tf_models_nightly-2.17.0.dev20240612.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.17.0.dev20240612.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.17.0.dev20240612.dist-info/METADATA,sha256=L8f-eOf_Ygx6KUBQjrmxw7M6NOWla-lHBmqARj5U4vg,1432
+tf_models_nightly-2.17.0.dev20240612.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.17.0.dev20240612.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.17.0.dev20240612.dist-info/RECORD,,

AUTHORS, LICENSE, WHEEL, top_level.txt: File without changes (only the dist-info directory name moves from dev20240610 to dev20240612).