tf-models-nightly 2.18.0.dev20240912__py2.py3-none-any.whl → 2.18.0.dev20240914__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

official/nlp/modeling/layers/block_sparse_attention.py
@@ -14,14 +14,42 @@
 
 """Block sparse attention converts query/key/value into blocks and performs diagonal block sparse attention."""
 import collections
+import logging
 
 import tensorflow as tf, tf_keras
 
 
+def _large_compatible_negative(tensor_type):
+  """Large negative number as Tensor.
+
+  This function is necessary because the standard value for epsilon
+  in this module (-1e9) cannot be represented using tf.float16
+
+  Args:
+    tensor_type: a dtype to determine the type.
+
+  Returns:
+    a large negative number.
+  """
+  # In case of dtype=float16 (e.g., for mixed-precision), the largest
+  # negative number (dtypes.float16.min) is divided by 2, in order to
+  # avoid overflows when summing negative inputs.
+  if tensor_type == tf.float16:
+    return tf.float16.min / 2.0
+  return -1e9
+
+
 class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
   """Multi-head block sparse attention layer."""
 
-  def __init__(self, src_block_size=None, tgt_block_size=None, **kwargs):
+  def __init__(
+      self,
+      src_block_size=None,
+      tgt_block_size=None,
+      use_sigmoid_attn=False,
+      sigmoid_attn_bias=None,
+      **kwargs
+  ):
     """Initializes the block sparse attention layer.
 
     Args:
@@ -30,6 +58,9 @@ class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
       tgt_block_size: The block size of the key/value. An integer that divides
         the sequence length into blocks. The number of blocks in the source and
         target must be the same.
+      use_sigmoid_attn: If enabled, uses sigmoid instead of softmax to compute
+        attn probs. https://arxiv.org/pdf/2409.04431
+      sigmoid_attn_bias: Bias for sigmoid attn. Suggested value -ln(seq_len).
       **kwargs: Args passed to the base class.
     """
     super().__init__(**kwargs)
@@ -37,11 +68,24 @@ class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
       raise ValueError("src_block_size must be specified.")
     self._src_block_size = src_block_size
     self._tgt_block_size = tgt_block_size or self._src_block_size
+    self._use_sigmoid_attn = use_sigmoid_attn
+    self._sigmoid_attn_bias = sigmoid_attn_bias
+    if self._use_sigmoid_attn:
+      if self._sigmoid_attn_bias is None:
+        raise ValueError(
+            "sigmoid_attn_bias must be specified for sigmoid attn."
+        )
 
   def _build_from_signature(self, query, value, key=None):
     # pytype: disable=attribute-error
     super()._build_from_signature(query, value, key)
     # pytype: enable=attribute-error
+    # If block sizes are same as sequence lengths, we defer to default attn.
+    if (
+        self._query_shape[-2] == self._src_block_size
+        and self._key_shape[-2] == self._tgt_block_size
+    ):
+      return
     # The following capital letters are used to denote the tensor dimension
     # parameters:
     # B = batch size
@@ -127,11 +171,38 @@ class MultiHeadAttention(tf_keras.layers.MultiHeadAttention):
     if attention_mask is not None:
       # `attention_mask` = [B, 1, L, T, S]
       attention_mask = tf.expand_dims(attention_mask, axis=1)
-    return self._softmax(attention_scores, attention_mask)
+    if self._use_sigmoid_attn:
+      if attention_mask is not None:
+        adder = (1.0 - tf.cast(attention_mask, attention_scores.dtype)) * (
+            _large_compatible_negative(attention_scores.dtype)
+        )
+        attention_scores += adder
+      attention_scores += self._sigmoid_attn_bias
+      return tf_keras.activations.sigmoid(attention_scores)
+    else:
+      return self._softmax(attention_scores, attention_mask)
 
   def _compute_attention(
       self, query, key, value, attention_mask=None, training=None
   ):
+    # If block sizes are same as sequence lengths, we defer to default attn.
+    if (
+        self._query_shape[-2] == self._src_block_size
+        and self._key_shape[-2] == self._tgt_block_size
+    ):
+      logging.info(
+          "Computing default attention as block sizes are equal to sequence"
+          " lengths."
+      )
+      # pytype: disable=attribute-error
+      return super()._compute_attention(
+          query,
+          key,
+          value,
+          attention_mask=attention_mask,
+          training=training,
+      )
+      # pytype: enable=attribute-error
     # src_num_blocks and tgt_num_blocks are the number of blocks in the source
     # and target. Care should be taken to ensure that the number of blocks in
     # the source and target are the same.
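
The hunks above add an optional sigmoid attention path (https://arxiv.org/pdf/2409.04431) next to the existing softmax path, plus a fall-through to the stock Keras attention when the block sizes equal the sequence lengths. A minimal usage sketch, assuming the layer is importable the way the package's own test file imports it and that the shapes below are arbitrary example values:

import math
import tensorflow as tf
from official.nlp.modeling.layers import block_sparse_attention

seq_len, width = 16, 64
layer = block_sparse_attention.MultiHeadAttention(
    num_heads=4,
    key_dim=16,
    src_block_size=4,   # must divide seq_len; source/target block counts must match
    tgt_block_size=4,
    use_sigmoid_attn=True,
    sigmoid_attn_bias=-math.log(seq_len),  # docstring suggests -ln(seq_len)
)
query = tf.random.uniform((2, seq_len, width))
output = layer(query=query, value=query)  # self-attention; output shape (2, 16, 64)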

official/nlp/modeling/layers/block_sparse_attention_test.py
@@ -14,6 +14,8 @@
 
 """Tests for block sparse attention layer."""
 
+import math
+
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf, tf_keras
@@ -53,12 +55,29 @@ class BlockSparseAttentionTest(tf.test.TestCase, parameterized.TestCase):
     output = test_layer(query, query)
     self.assertEqual(output.shape.as_list(), [None, 40, 80])
 
-  @parameterized.named_parameters(("with_bias", True), ("no_bias", False))
-  def test_masked_attention(self, use_bias):
+  @parameterized.named_parameters(
+      ("with_bias", True),
+      ("no_bias", False),
+      ("with_sigmoid_attn", True, True),
+  )
+  def test_masked_attention(
+      self,
+      use_bias,
+      use_sigmoid_attn=False,
+  ):
     """Test with a mask tensor."""
+    if use_sigmoid_attn:
+      sigmoid_attn_bias = -math.log(2)
+    else:
+      sigmoid_attn_bias = None
     test_layer = block_sparse_attention.MultiHeadAttention(
-        num_heads=4, key_dim=2, use_bias=use_bias, src_block_size=2,
+        num_heads=4,
+        key_dim=2,
+        use_bias=use_bias,
+        src_block_size=2,
         tgt_block_size=1,
+        use_sigmoid_attn=use_sigmoid_attn,
+        sigmoid_attn_bias=sigmoid_attn_bias,
     )
     # Create a 3-dimensional input (the first dimension is implicit).
     batch_size = 3
@@ -112,6 +131,77 @@ class BlockSparseAttentionTest(tf.test.TestCase, parameterized.TestCase):
     self.assertLen(test_layer._query_dense.trainable_variables, 1)
     self.assertLen(test_layer._output_dense.trainable_variables, 1)
 
+  @parameterized.named_parameters(
+      ("default_with_softmax", False),
+      ("default_with_sigmoid", True),
+  )
+  def test_default_masked_attention(
+      self,
+      use_sigmoid_attn=False,
+  ):
+    """Test with a mask tensor."""
+    seq_len = 8
+    if use_sigmoid_attn:
+      sigmoid_attn_bias = -math.log(seq_len)
+    else:
+      sigmoid_attn_bias = None
+    test_layer = block_sparse_attention.MultiHeadAttention(
+        num_heads=4,
+        key_dim=2,
+        use_bias=True,
+        src_block_size=seq_len,
+        tgt_block_size=seq_len,
+        use_sigmoid_attn=use_sigmoid_attn,
+        sigmoid_attn_bias=sigmoid_attn_bias,
+    )
+    # Create a 3-dimensional input (the first dimension is implicit).
+    batch_size = 3
+    query = tf_keras.Input(shape=(seq_len, 8))
+    value = tf_keras.Input(shape=(seq_len, 8))
+    mask_tensor = tf_keras.Input(shape=(seq_len, seq_len))
+    output = test_layer(query=query, value=value, attention_mask=mask_tensor)
+
+    # Create a model containing the test layer.
+    model = tf_keras.Model([query, value, mask_tensor], output)
+
+    # Generate data for the input (non-mask) tensors.
+    from_data = 10 * np.random.random_sample((batch_size, seq_len, 8))
+    to_data = 10 * np.random.random_sample((batch_size, seq_len, 8))
+
+    # Invoke the data with a random set of mask data. This should mask at
+    # least one element.
+    mask_data = np.random.randint(2, size=(batch_size, seq_len, seq_len))
+    masked_output_data = model.predict([from_data, to_data, mask_data])
+
+    # Invoke the same data, but with a null mask (where no elements are
+    # masked).
+    null_mask_data = np.ones((batch_size, seq_len, seq_len))
+    unmasked_output_data = model.predict([from_data, to_data, null_mask_data])
+
+    # Because one data is masked and one is not, the outputs should not be
+    # the same.
+    self.assertNotAllClose(masked_output_data, unmasked_output_data)
+
+    # Tests the layer with three inputs: Q, K, V.
+    key = tf_keras.Input(shape=(seq_len, 8))
+    output = test_layer(
+        query, value=value, key=key, attention_mask=mask_tensor
+    )
+    model = tf_keras.Model([query, value, key, mask_tensor], output)
+
+    masked_output_data = model.predict(
+        [from_data, to_data, to_data, mask_data]
+    )
+    unmasked_output_data = model.predict(
+        [from_data, to_data, to_data, null_mask_data]
+    )
+    # Because one data is masked and one is not, the outputs should not be
+    # the same.
+    self.assertNotAllClose(masked_output_data, unmasked_output_data)
+
+    self.assertLen(test_layer._query_dense.trainable_variables, 2)
+    self.assertLen(test_layer._output_dense.trainable_variables, 2)
+
   def test_masked_attention_with_scores(self):
     """Test with a mask tensor."""
     test_layer = block_sparse_attention.MultiHeadAttention(
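
The new tests above verify that masked and unmasked inputs produce different outputs under both the softmax and the sigmoid attention paths. For reference, the sigmoid branch added to `_masked_softmax` applies the mask additively before the activation; the standalone sketch below reproduces that arithmetic outside the layer (the helper name is hypothetical, not part of the package):

import tensorflow as tf

def sigmoid_attn_probs(scores, mask, bias):
  # Hypothetical standalone helper: masked positions receive a large negative
  # additive term, so sigmoid(score + bias) collapses to ~0 there, mirroring
  # the masking logic in the diff above.
  if mask is not None:
    big_neg = tf.float16.min / 2.0 if scores.dtype == tf.float16 else -1e9
    scores += (1.0 - tf.cast(mask, scores.dtype)) * big_neg
  return tf.sigmoid(scores + bias)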

official/nlp/modeling/layers/transformer_encoder_block.py
@@ -112,6 +112,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
                num_kv_heads=None,
                src_block_size=None,
                tgt_block_size=None,
+               use_sigmoid_attn=False,
+               sigmoid_attn_bias=None,
                **kwargs):
     """Initializes `TransformerEncoderBlock`.
 
@@ -185,6 +187,10 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
         `block_sparse_attention.MultiHeadAttention` for more details.
       tgt_block_size: Target block size. Refer to
         `block_sparse_attention.MultiHeadAttention` for more details.
+      use_sigmoid_attn: This param is only used in
+        `block_sparse_attention.MultiHeadAttention`
+      sigmoid_attn_bias: This param is only used in
+        `block_sparse_attention.MultiHeadAttention`
       **kwargs: keyword arguments.
     """
     util.filter_kwargs(kwargs)
@@ -222,6 +228,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
     self._num_kv_heads = num_kv_heads
     self._src_block_size = src_block_size
     self._tgt_block_size = tgt_block_size
+    self._use_sigmoid_attn = use_sigmoid_attn
+    self._sigmoid_attn_bias = sigmoid_attn_bias
     if self._num_kv_heads is not None and self._src_block_size is not None:
       raise ValueError(
           "Block sparse attention does not support Multi-query attention."
@@ -285,6 +293,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
       attention_layer_kwargs.update(
          src_block_size=self._src_block_size,
          tgt_block_size=self._tgt_block_size,
+         use_sigmoid_attn=self._use_sigmoid_attn,
+         sigmoid_attn_bias=self._sigmoid_attn_bias,
          name="block_sparse_attention",
       )
       attention_fn = block_sparse_attention.MultiHeadAttention
@@ -413,6 +423,8 @@ class TransformerEncoderBlock(tf_keras.layers.Layer):
         "num_kv_heads": self._num_kv_heads,
         "src_block_size": self._src_block_size,
         "tgt_block_size": self._tgt_block_size,
+        "use_sigmoid_attn": self._use_sigmoid_attn,
+        "sigmoid_attn_bias": self._sigmoid_attn_bias,
     }
     base_config = super().get_config()
     return dict(list(base_config.items()) + list(config.items()))
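
The same two knobs are threaded through `TransformerEncoderBlock`: stored in `__init__`, forwarded to `block_sparse_attention.MultiHeadAttention` when the attention layer is built, and serialized in `get_config`. A construction sketch, assuming the block is importable from `official.nlp.modeling.layers.transformer_encoder_block` and using arbitrary sizes:

import math
import tensorflow as tf
from official.nlp.modeling.layers import transformer_encoder_block

seq_len = 24
block = transformer_encoder_block.TransformerEncoderBlock(
    num_attention_heads=8,
    inner_dim=128,
    inner_activation="relu",
    src_block_size=4,        # selects the block sparse attention path
    tgt_block_size=4,
    use_sigmoid_attn=True,
    sigmoid_attn_bias=-math.log(seq_len),
)
x = tf.random.uniform((2, seq_len, 64))
y = block(x)  # output keeps the input shape: (2, 24, 64)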

official/nlp/modeling/layers/transformer_encoder_block_test.py
@@ -14,6 +14,8 @@
 
 """Tests for Keras-based transformer block layer."""
 
+import math
+
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf, tf_keras
@@ -751,7 +753,11 @@ class TransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
         output_tensor[1].shape.as_list(), expected_attention_scores_shape
     )
 
-  def test_block_sparse_attention(self):
+  @parameterized.named_parameters(
+      ('use_softmax_attn', False),
+      ('use_sigmoid_attn', True),
+  )
+  def test_block_sparse_attention(self, use_sigmoid_attn):
     num_attention_heads = 8
     sequence_length = 21
     width = 80
@@ -765,6 +771,10 @@ class TransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
         return_attention_scores=True,
         src_block_size=src_block_size,
         tgt_block_size=tgt_block_size,
+        use_sigmoid_attn=use_sigmoid_attn,
+        sigmoid_attn_bias=-math.log(sequence_length)
+        if use_sigmoid_attn
+        else None,
     )
     # Create a 3-dimensional input (the first dimension is implicit).
     data_tensor = tf_keras.Input(shape=(sequence_length, width))

{tf_models_nightly-2.18.0.dev20240912.dist-info → tf_models_nightly-2.18.0.dev20240914.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tf-models-nightly
-Version: 2.18.0.dev20240912
+Version: 2.18.0.dev20240914
 Summary: TensorFlow Official Models
 Home-page: https://github.com/tensorflow/models
 Author: Google Inc.

{tf_models_nightly-2.18.0.dev20240912.dist-info → tf_models_nightly-2.18.0.dev20240914.dist-info}/RECORD
@@ -305,8 +305,8 @@ official/nlp/modeling/layers/bigbird_attention.py,sha256=dzutgRoQt2DFsYMpMILv_QF
 official/nlp/modeling/layers/bigbird_attention_test.py,sha256=cBYwK5k1rnykZ0gif-n7VaByLIoElA-N0_svCRKASoU,2206
 official/nlp/modeling/layers/block_diag_feedforward.py,sha256=FDEt-J_QjOxwar3eT5yjMs4hR41Ppke1zj7iswsZR4M,7243
 official/nlp/modeling/layers/block_diag_feedforward_test.py,sha256=wcg8In6FIOCxcKqe5rucftjJ_kUWTi9Ei7eEmlVCYpE,4181
-official/nlp/modeling/layers/block_sparse_attention.py,sha256=Vjy0JULOb9u6-EzD460kXCotsibqyD29imlmrb7aVSY,7580
-official/nlp/modeling/layers/block_sparse_attention_test.py,sha256=YF2_-I27INUFtu-WP7s7C1kpYmsobNIGOWM1iUvSD5Y,12041
+official/nlp/modeling/layers/block_sparse_attention.py,sha256=eY6jkSI-TrnL0JkP_9B-0DCxzppZdK_c8qp6Uw6yiD0,9923
+official/nlp/modeling/layers/block_sparse_attention_test.py,sha256=KSQENNhRG7Y1qDpdW_O3Ws6nPC4se7zv1UcxF2o7blI,15037
 official/nlp/modeling/layers/cls_head.py,sha256=0X_gdjnAt6TZVrH_xkDcQCpwLuVz5Pb7d04wEVN_Kn8,16208
 official/nlp/modeling/layers/cls_head_test.py,sha256=01oMmiuyp1lDEXBYa9r3krn6BtH-QuSedGOca9LViEc,8888
 official/nlp/modeling/layers/factorized_embedding.py,sha256=4oFRYJbpoaSxqv8hTWY2JPGPllp-zhniz99IyRtlzV8,2902
@@ -363,8 +363,8 @@ official/nlp/modeling/layers/tn_expand_condense_test.py,sha256=J52mXzoiuaXfR61kh
 official/nlp/modeling/layers/tn_transformer_expand_condense.py,sha256=gbGJOrgxJd1SyMGB6ME04FSxuZfHqsi94Xxt23l7368,11032
 official/nlp/modeling/layers/tn_transformer_test.py,sha256=Fh-EDRoAkhO7ccD3w3FsJHC51MnZySv8jBlHYnvKZMc,8893
 official/nlp/modeling/layers/transformer.py,sha256=yofIEOjZpcvDmHbcjBmkZrl5iSe6pLtMsetNbXmxDnY,20087
-official/nlp/modeling/layers/transformer_encoder_block.py,sha256=9EuAsedY35eIFc4z-22QQ4c47NHrEe8-8uzjtPfgNTM,21977
-official/nlp/modeling/layers/transformer_encoder_block_test.py,sha256=chs8-M69Gx_Zcp7Pi7sNKjpWgyuSHDw_fNrRh6URPLc,30686
+official/nlp/modeling/layers/transformer_encoder_block.py,sha256=n7_HgFjCye7ZNxzQ67CtgboDKPIE-28796Y2aW8Zk_U,22566
+official/nlp/modeling/layers/transformer_encoder_block_test.py,sha256=5B_h8iNweUiRJR2IH1zxFelsfhVPEJJ4dEzL_pHPjI0,30968
 official/nlp/modeling/layers/transformer_scaffold.py,sha256=m8TF4geBkm8-VJQiTpzMI6FSJZry6oa2vPO3FXCCClE,15704
 official/nlp/modeling/layers/transformer_scaffold_test.py,sha256=pqUGldhmAKROrd4eoCWmHNtKOdCO6PH_-EigcYnvIpE,19920
 official/nlp/modeling/layers/transformer_test.py,sha256=kC_9NcLbJnBbuTaE_7BW60EF8xG_QUoICj0t0gS7O4Q,5522
@@ -1222,9 +1222,9 @@ tensorflow_models/tensorflow_models_test.py,sha256=nc6A9K53OGqF25xN5St8EiWvdVbda
 tensorflow_models/nlp/__init__.py,sha256=4tA5Pf4qaFwT-fIFOpX7x7FHJpnyJT-5UgOeFYTyMlc,807
 tensorflow_models/uplift/__init__.py,sha256=mqfa55gweOdpKoaQyid4A_4u7xw__FcQeSIF0k_pYmI,999
 tensorflow_models/vision/__init__.py,sha256=zBorY_v5xva1uI-qxhZO3Qh-Dii-Suq6wEYh6hKHDfc,833
-tf_models_nightly-2.18.0.dev20240912.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
-tf_models_nightly-2.18.0.dev20240912.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
-tf_models_nightly-2.18.0.dev20240912.dist-info/METADATA,sha256=DSdv3ZNz6oi2xj_9C5HPYHoh0h1dxv8tz_LER44-4Ms,1432
-tf_models_nightly-2.18.0.dev20240912.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
-tf_models_nightly-2.18.0.dev20240912.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
-tf_models_nightly-2.18.0.dev20240912.dist-info/RECORD,,
+tf_models_nightly-2.18.0.dev20240914.dist-info/AUTHORS,sha256=1dG3fXVu9jlo7bul8xuix5F5vOnczMk7_yWn4y70uw0,337
+tf_models_nightly-2.18.0.dev20240914.dist-info/LICENSE,sha256=WxeBS_DejPZQabxtfMOM_xn8qoZNJDQjrT7z2wG1I4U,11512
+tf_models_nightly-2.18.0.dev20240914.dist-info/METADATA,sha256=tcenYrqEbPZvYOYLGuM1b4NLtuby7duThrpqua_darc,1432
+tf_models_nightly-2.18.0.dev20240914.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
+tf_models_nightly-2.18.0.dev20240914.dist-info/top_level.txt,sha256=gum2FfO5R4cvjl2-QtP-S1aNmsvIZaFFT6VFzU0f4-g,33
+tf_models_nightly-2.18.0.dev20240914.dist-info/RECORD,,