tf-models-nightly 2.18.0.dev20240912__py2.py3-none-any.whl → 2.18.0.dev20240914__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

official/nlp/modeling/layers/block_sparse_attention.py
@@ -14,14 +14,42 @@
 
 """Block sparse attention converts query/key/value into blocks and performs diagonal block sparse attention."""
 import collections
+import logging
 
 import tensorflow as tf, tf_keras
 
 
+def _large_compatible_negative(tensor_type):
+  """Large negative number as Tensor.
+
+  This function is necessary because the standard value for epsilon
+  in this module (-1e9) cannot be represented using tf.float16
+
+  Args:
+    tensor_type: a dtype to determine the type.
+
+  Returns:
+    a large negative number.
+  """
+  # In case of dtype=float16 (e.g., for mixed-precision), the largest
+  # negative number (dtypes.float16.min) is divided by 2, in order to
+  # avoid overflows when summing negative inputs.
+  if tensor_type == tf.float16:
+    return tf.float16.min / 2.0
+  return -1e9
+
+
 class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
   """Multi-head block sparse attention layer."""
 
-  def __init__(self, src_block_size=None, tgt_block_size=None, **kwargs):
+  def __init__(
+      self,
+      src_block_size=None,
+      tgt_block_size=None,
+      use_sigmoid_attn=False,
+      sigmoid_attn_bias=None,
+      **kwargs
+  ):
     """Initializes the block sparse attention layer.
 
     Args:
@@ -30,6 +58,9 @@ class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
       tgt_block_size: The block size of the key/value. An integer that divides
         the sequence length into blocks. The number of blocks in the source and
         target must be the same.
+      use_sigmoid_attn: If enabled, uses sigmoid instead of softmax to compute
+        attn probs. https://arxiv.org/pdf/2409.04431
+      sigmoid_attn_bias: Bias for sigmoid attn. Suggested value -ln(seq_len).
       **kwargs: Args passed to the base class.
     """
     super().__init__(**kwargs)
@@ -37,11 +68,24 @@ class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
       raise ValueError("src_block_size must be specified.")
     self._src_block_size = src_block_size
     self._tgt_block_size = tgt_block_size or self._src_block_size
+    self._use_sigmoid_attn = use_sigmoid_attn
+    self._sigmoid_attn_bias = sigmoid_attn_bias
+    if self._use_sigmoid_attn:
+      if self._sigmoid_attn_bias is None:
+        raise ValueError(
+            "sigmoid_attn_bias must be specified for sigmoid attn."
+        )
 
   def _build_from_signature(self, query, value, key=None):
     # pytype: disable=attribute-error
     super()._build_from_signature(query, value, key)
     # pytype: enable=attribute-error
+    # If block sizes are same as sequence lengths, we defer to default attn.
+    if (
+        self._query_shape[-2] == self._src_block_size
+        and self._key_shape[-2] == self._tgt_block_size
+    ):
+      return
     # The following capital letters are used to denote the tensor dimension
     # parameters:
     # B = batch size
@@ -127,11 +171,38 @@ class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
     if attention_mask is not None:
       # `attention_mask` = [B, 1, L, T, S]
       attention_mask = tf.expand_dims(attention_mask, axis=1)
-    return self._softmax(attention_scores, attention_mask)
+    if self._use_sigmoid_attn:
+      if attention_mask is not None:
+        adder = (1.0 - tf.cast(attention_mask, attention_scores.dtype)) * (
+            _large_compatible_negative(attention_scores.dtype)
+        )
+        attention_scores += adder
+      attention_scores += self._sigmoid_attn_bias
+      return tf_keras.activations.sigmoid(attention_scores)
+    else:
+      return self._softmax(attention_scores, attention_mask)
 
   def _compute_attention(
       self, query, key, value, attention_mask=None, training=None
   ):
+    # If block sizes are same as sequence lengths, we defer to default attn.
+    if (
+        self._query_shape[-2] == self._src_block_size
+        and self._key_shape[-2] == self._tgt_block_size
+    ):
+      logging.info(
+          "Computing default attention as block sizes are equal to sequence"
+          " lengths."
+      )
+      # pytype: disable=attribute-error
+      return super()._compute_attention(
+          query,
+          key,
+          value,
+          attention_mask=attention_mask,
+          training=training,
+      )
+      # pytype: enable=attribute-error
     # src_num_blocks and tgt_num_blocks are the number of blocks in the source
     # and target. Care should be taken to ensure that the number of blocks in
     # the source and target are the same.
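
The hunks above add an optional sigmoid attention path (https://arxiv.org/pdf/2409.04431) next to the existing softmax path, plus a fall-through to the stock Keras attention when the block sizes equal the sequence lengths. A minimal usage sketch, assuming the layer is importable the way the package's own test file imports it and that the shapes below are arbitrary example values:

import math
import tensorflow as tf
from official.nlp.modeling.layers import block_sparse_attention

seq_len, width = 16, 64
layer = block_sparse_attention.MultiHeadAttention(
    num_heads=4,
    key_dim=16,
    src_block_size=4,   # must divide seq_len; source/target block counts must match
    tgt_block_size=4,
    use_sigmoid_attn=True,
    sigmoid_attn_bias=-math.log(seq_len),  # docstring suggests -ln(seq_len)
)
query = tf.random.uniform((2, seq_len, width))
output = layer(query=query, value=query)  # self-attention; output shape (2, 16, 64)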

official/nlp/modeling/layers/block_sparse_attention_test.py
@@ -14,6 +14,8 @@
 
 """Tests for block sparse attention layer."""
 
+import math
+
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf, tf_keras
@@ -53,12 +55,29 @@ class BlockSparseAttentionTest(tf.test.TestCase, parameterized.TestCase):
     output = test_layer(query, query)
     self.assertEqual(output.shape.as_list(), [None, 40, 80])
 
-  @parameterized.named_parameters(("with_bias", True), ("no_bias", False))
-  def test_masked_attention(self, use_bias):
+  @parameterized.named_parameters(
+      ("with_bias", True),
+      ("no_bias", False),
+      ("with_sigmoid_attn", True, True),
+  )
+  def test_masked_attention(
+      self,
+      use_bias,
+      use_sigmoid_attn=False,
+  ):
     """Test with a mask tensor."""
+    if use_sigmoid_attn:
+      sigmoid_attn_bias = -math.log(2)
+    else:
+      sigmoid_attn_bias = None
     test_layer = block_sparse_attention.MultiHeadAttention(
-        num_heads=4, key_dim=2, use_bias=use_bias, src_block_size=2,
+        num_heads=4,
+        key_dim=2,
+        use_bias=use_bias,
+        src_block_size=2,
         tgt_block_size=1,
+        use_sigmoid_attn=use_sigmoid_attn,
+        sigmoid_attn_bias=sigmoid_attn_bias,
     )
     # Create a 3-dimensional input (the first dimension is implicit).
     batch_size = 3
@@ -112,6 +131,77 @@ class BlockSparseAttentionTest(tf.test.TestCase, parameterized.TestCase):
     self.assertLen(test_layer._query_dense.trainable_variables, 1)
     self.assertLen(test_layer._output_dense.trainable_variables, 1)
 
+  @parameterized.named_parameters(
+      ("default_with_softmax", False),
+      ("default_with_sigmoid", True),
+  )
+  def test_default_masked_attention(
+      self,
+      use_sigmoid_attn=False,
+  ):
+    """Test with a mask tensor."""
+    seq_len = 8
+    if use_sigmoid_attn:
+      sigmoid_attn_bias = -math.log(seq_len)
+    else:
+      sigmoid_attn_bias = None
+    test_layer = block_sparse_attention.MultiHeadAttention(
+        num_heads=4,
+        key_dim=2,
+        use_bias=True,
+        src_block_size=seq_len,
+        tgt_block_size=seq_len,
+        use_sigmoid_attn=use_sigmoid_attn,
+        sigmoid_attn_bias=sigmoid_attn_bias,
+    )
+    # Create a 3-dimensional input (the first dimension is implicit).
+    batch_size = 3
+    query = tf_keras.Input(shape=(seq_len, 8))
+    value = tf_keras.Input(shape=(seq_len, 8))
+    mask_tensor = tf_keras.Input(shape=(seq_len, seq_len))
+    output = test_layer(query=query, value=value, attention_mask=mask_tensor)
+
+    # Create a model containing the test layer.
+    model = tf_keras.Model([query, value, mask_tensor], output)
+
+    # Generate data for the input (non-mask) tensors.
+    from_data = 10 * np.random.random_sample((batch_size, seq_len, 8))
+    to_data = 10 * np.random.random_sample((batch_size, seq_len, 8))
+
+    # Invoke the data with a random set of mask data. This should mask at
+    # least one element.
+    mask_data = np.random.randint(2, size=(batch_size, seq_len, seq_len))
+    masked_output_data = model.predict([from_data, to_data, mask_data])
+
+    # Invoke the same data, but with a null mask (where no elements are
+    # masked).
+    null_mask_data = np.ones((batch_size, seq_len, seq_len))
+    unmasked_output_data = model.predict([from_data, to_data, null_mask_data])
+
+    # Because one data is masked and one is not, the outputs should not be
+    # the same.
+    self.assertNotAllClose(masked_output_data, unmasked_output_data)
+
+    # Tests the layer with three inputs: Q, K, V.
+    key = tf_keras.Input(shape=(seq_len, 8))
+    output = test_layer(
+        query, value=value, key=key, attention_mask=mask_tensor
+    )
+    model = tf_keras.Model([query, value, key, mask_tensor], output)
+
+    masked_output_data = model.predict(
+        [from_data, to_data, to_data, mask_data]
+    )
+    unmasked_output_data = model.predict(
+        [from_data, to_data, to_data, null_mask_data]
+    )
+    # Because one data is masked and one is not, the outputs should not be
+    # the same.
+    self.assertNotAllClose(masked_output_data, unmasked_output_data)
+
+    self.assertLen(test_layer._query_dense.trainable_variables, 2)
+    self.assertLen(test_layer._output_dense.trainable_variables, 2)
+
   def test_masked_attention_with_scores(self):
     """Test with a mask tensor."""
     test_layer = block_sparse_attention.MultiHeadAttention(
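
The new tests above verify that masked and unmasked inputs produce different outputs under both the softmax and the sigmoid attention paths. For reference, the sigmoid branch added to `_masked_softmax` applies the mask additively before the activation; the standalone sketch below reproduces that arithmetic outside the layer (the helper name is hypothetical, not part of the package):

import tensorflow as tf

def sigmoid_attn_probs(scores, mask, bias):
  # Hypothetical standalone helper: masked positions receive a large negative
  # additive term, so sigmoid(score + bias) collapses to ~0 there, mirroring
  # the masking logic in the diff above.
  if mask is not None:
    big_neg = tf.float16.min / 2.0 if scores.dtype == tf.float16 else -1e9
    scores += (1.0 - tf.cast(mask, scores.dtype)) * big_neg
  return tf.sigmoid(scores + bias)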

official/nlp/modeling/layers/transformer_encoder_block.py
@@ -112,6 +112,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
                num_kv_heads=None,
                src_block_size=None,
                tgt_block_size=None,
+               use_sigmoid_attn=False,
+               sigmoid_attn_bias=None,
                **kwargs):
     """Initializes `TransformerEncoderBlock`.
 
@@ -185,6 +187,10 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
         `block_sparse_attention.MultiHeadAttention` for more details.
       tgt_block_size: Target block size. Refer to
         `block_sparse_attention.MultiHeadAttention` for more details.
+      use_sigmoid_attn: This param is only used in
+        `block_sparse_attention.MultiHeadAttention`
+      sigmoid_attn_bias: This param is only used in
+        `block_sparse_attention.MultiHeadAttention`
       **kwargs: keyword arguments.
     """
     util.filter_kwargs(kwargs)
@@ -222,6 +228,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
     self._num_kv_heads = num_kv_heads
     self._src_block_size = src_block_size
     self._tgt_block_size = tgt_block_size
+    self._use_sigmoid_attn = use_sigmoid_attn
+    self._sigmoid_attn_bias = sigmoid_attn_bias
     if self._num_kv_heads is not None and self._src_block_size is not None:
       raise ValueError(
           "Block sparse attention does not support Multi-query attention."
@@ -285,6 +293,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
       attention_layer_kwargs.update(
          src_block_size=self._src_block_size,
          tgt_block_size=self._tgt_block_size,
+         use_sigmoid_attn=self._use_sigmoid_attn,
+         sigmoid_attn_bias=self._sigmoid_attn_bias,
          name="block_sparse_attention",
       )
       attention_fn = block_sparse_attention.MultiHeadAttention
@@ -413,6 +423,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
         "num_kv_heads": self._num_kv_heads,
         "src_block_size": self._src_block_size,
         "tgt_block_size": self._tgt_block_size,
+        "use_sigmoid_attn": self._use_sigmoid_attn,
+        "sigmoid_attn_bias": self._sigmoid_attn_bias,
     }
     base_config = super().get_config()
     return dict(list(base_config.items()) + list(config.items()))
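
The same two knobs are threaded through `TransformerEncoderBlock`: stored in `__init__`, forwarded to `block_sparse_attention.MultiHeadAttention` when the attention layer is built, and serialized in `get_config`. A construction sketch, assuming the block is importable from `official.nlp.modeling.layers.transformer_encoder_block` and using arbitrary sizes:

import math
import tensorflow as tf
from official.nlp.modeling.layers import transformer_encoder_block

seq_len = 24
block = transformer_encoder_block.TransformerEncoderBlock(
    num_attention_heads=8,
    inner_dim=128,
    inner_activation="relu",
    src_block_size=4,        # selects the block sparse attention path
    tgt_block_size=4,
    use_sigmoid_attn=True,
    sigmoid_attn_bias=-math.log(seq_len),
)
x = tf.random.uniform((2, seq_len, 64))
y = block(x)  # output keeps the input shape: (2, 24, 64)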

official/nlp/modeling/layers/transformer_encoder_block_test.py
@@ -14,6 +14,8 @@
 
 """Tests for Keras-based transformer block layer."""
 
+import math
+
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf, tf_keras
@@ -751,7 +753,11 @@ class TransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
         output_tensor[1].shape.as_list(), expected_attention_scores_shape
     )
 
-  def test_block_sparse_attention(self):
+  @parameterized.named_parameters(
+      ('use_softmax_attn', False),
+      ('use_sigmoid_attn', True),
+  )
+  def test_block_sparse_attention(self, use_sigmoid_attn):
     num_attention_heads = 8
     sequence_length = 21
     width = 80
@@ -765,6 +771,10 @@ class TransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
         return_attention_scores=True,
         src_block_size=src_block_size,
         tgt_block_size=tgt_block_size,
+        use_sigmoid_attn=use_sigmoid_attn,
+        sigmoid_attn_bias=-math.log(sequence_length)
+        if use_sigmoid_attn
+        else None,
     )
     # Create a 3-dimensional input (the first dimension is implicit).
     data_tensor = tf_keras.Input(shape=(sequence_length, width))

{tf_models_nightly-2.18.0.dev20240912.dist-info → tf_models_nightly-2.18.0.dev20240914.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tf-models-nightly
-Version: 2.18.0.dev20240912
+Version: 2.18.0.dev20240914
 Summary: TensorFlow Official Models
 Home-page: https://github.com/tensorflow/models
 Author: Google Inc.

{tf_models_nightly-2.18.0.dev20240912.dist-info → tf_models_nightly-2.18.0.dev20240914.dist-info}/RECORD
@@ -305,8 +305,8 @@ official/nlp/modeling/layers/bigbird_attention.py,sha256=dzutgRoQt2DFsYMpMILv_QF
 official/nlp/modeling/layers/bigbird_attention_test.py,sha256=cBYwK5k1rnykZ0gif-n7VaByLIoElA-N0_svCRKASoU,2206
 official/nlp/modeling/layers/block_diag_feedforward.py,sha256=FDEt-J_QjOxwar3eT5yjMs4hR41Ppke1zj7iswsZR4M,7243
 official/nlp/modeling/layers/block_diag_feedforward_test.py,sha256=wcg8In6FIOCxcKqe5rucftjJ_kUWTi9Ei7eEmlVCYpE,4181
-official/nlp/modeling/layers/block_sparse_attention.py,sha256=Vjy0JULOb9u6-EzD460kXCotsibqyD29imlmrb7aVSY,7580
-official/nlp/modeling/layers/block_sparse_attention_test.py,sha256=YF2_-I27INUFtu-WP7s7C1kpYmsobNIGOWM1iUvSD5Y,12041
+official/nlp/modeling/layers/block_sparse_attention.py,sha256=eY6jkSI-TrnL0JkP_9B-0DCxzppZdK_c8qp6Uw6yiD0,9923
+official/nlp/modeling/layers/block_sparse_attention_test.py,sha256=KSQENNhRG7Y1qDpdW_O3Ws6nPC4se7zv1UcxF2o7blI,15037
 official/nlp/modeling/layers/cls_head.py,sha256=0X_gdjnAt6TZVrH_xkDcQCpwLuVz5Pb7d04wEVN_Kn8,16208
 official/nlp/modeling/layers/cls_head_test.py,sha256=01oMmiuyp1lDEXBYa9r3krn6BtH-QuSedGOca9LViEc,8888
 official/nlp/modeling/layers/factorized_embedding.py,sha256=4oFRYJbpoaSxqv8hTWY2JPGPllp-zhniz99IyRtlzV8,2902
@@ -363,8 +363,8 @@ official/nlp/modeling/layers/tn_expand_condense_test.py,sha256=J52mXzoiuaXfR61kh
 official/nlp/modeling/layers/tn_transformer_expand_condense.py,sha256=gbGJOrgxJd1SyMGB6ME04FSxuZfHqsi94Xxt23l7368,11032
 official/nlp/modeling/layers/tn_transformer_test.py,sha256=Fh-EDRoAkhO7ccD3w3FsJHC51MnZySv8jBlHYnvKZMc,8893
 official/nlp/modeling/layers/transformer.py,sha256=yofIEOjZpcvDmHbcjBmkZrl5iSe6pLtMsetNbXmxDnY,20087
-official/nlp/modeling/layers/transformer_encoder_block.py,sha256=9EuAsedY35eIFc4z-22QQ4c47NHrEe8-8uzjtPfgNTM,21977
-official/nlp/modeling/layers/transformer_encoder_block_test.py,sha256=chs8-M69Gx_Zcp7Pi7sNKjpWgyuSHDw_fNrRh6URPLc,30686
+official/nlp/modeling/layers/transformer_encoder_block.py,sha256=n7_HgFjCye7ZNxzQ67CtgboDKPIE-28796Y2aW8Zk_U,22566
+official/nlp/modeling/layers/transformer_encoder_block_test.py,sha256=5B_h8iNweUiRJR2IH1zxFelsfhVPEJJ4dEzL_pHPjI0,30968
 official/nlp/modeling/layers/transformer_scaffold.py,sha256=m8TF4geBkm8-VJQiTpzMI6FSJZry6oa2vPO3FXCCClE,15704
 official/nlp/modeling/layers/transformer_scaffold_test.py,sha256=pqUGldhmAKROrd4eoCWmHNtKOdCO6PH_-EigcYnvIpE,19920
 official/nlp/modeling/layers/transformer_test.py,sha256=kC_9NcLbJnBbuTaE_7BW60EF8xG_QUoICj0t0gS7O4Q,5522
@@ -1222,9 +1222,9 @@ tensorflow_models/tensorflow_models_test.py,sha256=nc6A9K53OGqF25xN5St8EiWvdVbda
 tensorflow_models/nlp/__init__.py,sha256=4tA5Pf4qaFwT-fIFOpX7x7FHJpnyJT-5UgOeFYTyMlc,807
 tensorflow_models/uplift/__init__.py,sha256=mqfa55gweOdpKoaQyid4A_4u7xw__FcQeSIF0k_pYmI,999
 tensorflow_models/vision/__init__.py,sha256=zBorY_v5xva1uI-qxhZO3Qh-Dii-Suq6wEYh6hKHDfc,833
-tf_models_nightly-2.18.0.dev20240912.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
-tf_models_nightly-2.18.0.dev20240912.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
-tf_models_nightly-2.18.0.dev20240912.dist-info/METADATA,sha256=DSdv3ZNz6oi2xj_9C5HPYHoh0h1dxv8tz_LER44-4Ms,1432
-tf_models_nightly-2.18.0.dev20240912.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
-tf_models_nightly-2.18.0.dev20240912.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
-tf_models_nightly-2.18.0.dev20240912.dist-info/RECORD,,
+tf_models_nightly-2.18.0.dev20240914.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.18.0.dev20240914.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.18.0.dev20240914.dist-info/METADATA,sha256=tcenYrqEbPZvYOYLGuM1b4NLtuby7duThrpqua_darc,1432
+tf_models_nightly-2.18.0.dev20240914.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.18.0.dev20240914.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.18.0.dev20240914.dist-info/RECORD,,