keras-hub-nightly 0.19.0.dev202412120352__py3-none-any.whl → 0.19.0.dev202412140350__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/api/layers/__init__.py +1 -0
- keras_hub/api/models/__init__.py +11 -6
- keras_hub/api/tokenizers/__init__.py +1 -1
- keras_hub/src/bounding_box/converters.py +2 -2
- keras_hub/src/layers/modeling/f_net_encoder.py +1 -1
- keras_hub/src/layers/modeling/masked_lm_head.py +2 -1
- keras_hub/src/layers/modeling/rms_normalization.py +8 -6
- keras_hub/src/layers/modeling/rotary_embedding.py +3 -2
- keras_hub/src/layers/modeling/token_and_position_embedding.py +1 -1
- keras_hub/src/layers/modeling/transformer_decoder.py +8 -6
- keras_hub/src/layers/modeling/transformer_encoder.py +3 -1
- keras_hub/src/metrics/bleu.py +1 -1
- keras_hub/src/models/albert/albert_text_classifier.py +7 -7
- keras_hub/src/models/bart/bart_backbone.py +4 -4
- keras_hub/src/models/bart/bart_seq_2_seq_lm.py +9 -8
- keras_hub/src/models/bert/bert_presets.py +4 -2
- keras_hub/src/models/bert/bert_text_classifier.py +3 -3
- keras_hub/src/models/causal_lm.py +19 -15
- keras_hub/src/models/clip/clip_vision_embedding.py +1 -1
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +2 -1
- keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -1
- keras_hub/src/models/deberta_v3/deberta_v3_text_classifier.py +4 -4
- keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +4 -4
- keras_hub/src/models/deberta_v3/disentangled_self_attention.py +3 -2
- keras_hub/src/models/deberta_v3/relative_embedding.py +1 -1
- keras_hub/src/models/deeplab_v3/deeplab_v3_backbone.py +17 -13
- keras_hub/src/models/deeplab_v3/deeplab_v3_presets.py +4 -3
- keras_hub/src/models/deeplab_v3/deeplab_v3_segmenter.py +1 -1
- keras_hub/src/models/densenet/densenet_backbone.py +3 -1
- keras_hub/src/models/densenet/densenet_image_classifier.py +1 -1
- keras_hub/src/models/densenet/densenet_presets.py +6 -6
- keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +1 -1
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +2 -2
- keras_hub/src/models/distil_bert/distil_bert_presets.py +2 -1
- keras_hub/src/models/distil_bert/distil_bert_text_classifier.py +5 -5
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +3 -3
- keras_hub/src/models/efficientnet/cba.py +1 -1
- keras_hub/src/models/efficientnet/efficientnet_backbone.py +20 -8
- keras_hub/src/models/efficientnet/efficientnet_image_classifier.py +1 -1
- keras_hub/src/models/efficientnet/efficientnet_presets.py +12 -11
- keras_hub/src/models/efficientnet/fusedmbconv.py +3 -5
- keras_hub/src/models/efficientnet/mbconv.py +1 -1
- keras_hub/src/models/electra/electra_backbone.py +2 -2
- keras_hub/src/models/f_net/f_net_text_classifier.py +3 -3
- keras_hub/src/models/f_net/f_net_text_classifier_preprocessor.py +3 -3
- keras_hub/src/models/falcon/falcon_backbone.py +5 -3
- keras_hub/src/models/falcon/falcon_causal_lm.py +18 -8
- keras_hub/src/models/falcon/falcon_tokenizer.py +7 -2
- keras_hub/src/models/flux/flux_layers.py +46 -44
- keras_hub/src/models/flux/flux_maths.py +24 -17
- keras_hub/src/models/flux/flux_model.py +24 -19
- keras_hub/src/models/flux/flux_presets.py +2 -1
- keras_hub/src/models/flux/flux_text_to_image.py +7 -3
- keras_hub/src/models/gemma/gemma_backbone.py +27 -20
- keras_hub/src/models/gemma/gemma_causal_lm.py +2 -2
- keras_hub/src/models/gemma/gemma_decoder_block.py +3 -1
- keras_hub/src/models/gemma/gemma_presets.py +9 -3
- keras_hub/src/models/gpt2/gpt2_causal_lm.py +2 -2
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +2 -1
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +3 -3
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +2 -1
- keras_hub/src/models/image_classifier_preprocessor.py +4 -1
- keras_hub/src/models/image_object_detector.py +2 -2
- keras_hub/src/models/image_object_detector_preprocessor.py +4 -4
- keras_hub/src/models/image_segmenter_preprocessor.py +2 -2
- keras_hub/src/models/llama/llama_backbone.py +34 -26
- keras_hub/src/models/llama3/llama3_backbone.py +12 -11
- keras_hub/src/models/llama3/llama3_causal_lm.py +1 -1
- keras_hub/src/models/mistral/mistral_backbone.py +16 -15
- keras_hub/src/models/mistral/mistral_causal_lm.py +3 -3
- keras_hub/src/models/mistral/mistral_transformer_decoder.py +2 -1
- keras_hub/src/models/mit/mit_backbone.py +4 -3
- keras_hub/src/models/mit/mit_layers.py +2 -1
- keras_hub/src/models/mobilenet/mobilenet_backbone.py +7 -7
- keras_hub/src/models/opt/opt_causal_lm.py +2 -2
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +5 -3
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +2 -2
- keras_hub/src/models/phi3/phi3_decoder.py +0 -1
- keras_hub/src/models/phi3/phi3_rotary_embedding.py +1 -1
- keras_hub/src/models/preprocessor.py +2 -2
- keras_hub/src/models/retinanet/feature_pyramid.py +3 -2
- keras_hub/src/models/retinanet/prediction_head.py +2 -2
- keras_hub/src/models/retinanet/retinanet_backbone.py +2 -2
- keras_hub/src/models/retinanet/retinanet_image_converter.py +1 -1
- keras_hub/src/models/retinanet/retinanet_object_detector.py +5 -6
- keras_hub/src/models/retinanet/retinanet_presets.py +2 -1
- keras_hub/src/models/roberta/roberta_backbone.py +2 -2
- keras_hub/src/models/roberta/roberta_presets.py +4 -2
- keras_hub/src/models/roberta/roberta_text_classifier.py +3 -3
- keras_hub/src/models/sam/sam_backbone.py +2 -2
- keras_hub/src/models/sam/sam_image_segmenter.py +6 -5
- keras_hub/src/models/sam/sam_layers.py +5 -3
- keras_hub/src/models/sam/sam_prompt_encoder.py +4 -2
- keras_hub/src/models/sam/sam_transformer.py +5 -4
- keras_hub/src/models/segformer/segformer_backbone.py +18 -14
- keras_hub/src/models/segformer/segformer_image_segmenter.py +51 -38
- keras_hub/src/models/segformer/segformer_presets.py +24 -12
- keras_hub/src/models/seq_2_seq_lm_preprocessor.py +1 -1
- keras_hub/src/models/stable_diffusion_3/mmdit.py +20 -1
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +1 -1
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +13 -6
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +2 -2
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +7 -3
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +1 -1
- keras_hub/src/models/task.py +4 -2
- keras_hub/src/models/text_classifier.py +2 -2
- keras_hub/src/models/text_to_image.py +5 -1
- keras_hub/src/models/vae/vae_layers.py +0 -1
- keras_hub/src/models/vit/__init__.py +5 -0
- keras_hub/src/models/vit/vit_backbone.py +152 -0
- keras_hub/src/models/vit/vit_image_classifier.py +187 -0
- keras_hub/src/models/vit/vit_image_classifier_preprocessor.py +12 -0
- keras_hub/src/models/vit/vit_image_converter.py +73 -0
- keras_hub/src/models/vit/vit_layers.py +391 -0
- keras_hub/src/models/vit/vit_presets.py +49 -0
- keras_hub/src/models/vit_det/vit_det_backbone.py +4 -2
- keras_hub/src/models/vit_det/vit_layers.py +3 -3
- keras_hub/src/models/whisper/whisper_audio_converter.py +1 -3
- keras_hub/src/models/whisper/whisper_backbone.py +6 -5
- keras_hub/src/models/whisper/whisper_decoder.py +3 -5
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +1 -1
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +2 -2
- keras_hub/src/models/xlm_roberta/xlm_roberta_text_classifier.py +4 -4
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +2 -1
- keras_hub/src/models/xlnet/relative_attention.py +20 -19
- keras_hub/src/models/xlnet/xlnet_backbone.py +2 -2
- keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +3 -5
- keras_hub/src/models/xlnet/xlnet_encoder.py +7 -9
- keras_hub/src/samplers/contrastive_sampler.py +2 -3
- keras_hub/src/samplers/sampler.py +2 -1
- keras_hub/src/tests/test_case.py +2 -2
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +2 -2
- keras_hub/src/tokenizers/byte_tokenizer.py +2 -8
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +2 -9
- keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +7 -12
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +8 -5
- keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +7 -3
- keras_hub/src/utils/preset_utils.py +25 -18
- keras_hub/src/utils/tensor_utils.py +4 -4
- keras_hub/src/utils/timm/convert_efficientnet.py +2 -4
- keras_hub/src/utils/transformers/convert_vit.py +150 -0
- keras_hub/src/utils/transformers/preset_loader.py +23 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +4 -3
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/RECORD +148 -140
- {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.19.0.dev202412120352.dist-info → keras_hub_nightly-0.19.0.dev202412140350.dist-info}/top_level.txt +0 -0
keras_hub/src/models/vit/vit_layers.py
@@ -0,0 +1,391 @@
+import keras
+from keras import ops
+
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+class MLP(keras.layers.Layer):
+    """Multi-Layer Perceptron (MLP) block.
+
+    Args:
+        hidden_dim: int. Dimensionality of the hidden representations.
+        mlp_dim: int. Dimensionality of the intermediate MLP layer.
+        use_bias: bool. Whether to use bias in the dense layers. Defaults to
+            `True`.
+        dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to `0.0`.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        hidden_dim,
+        mlp_dim,
+        use_bias=True,
+        dropout_rate=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # === Config ===
+        self.hidden_dim = hidden_dim
+        self.mlp_dim = mlp_dim
+        self.use_bias = use_bias
+        self.dropout_rate = dropout_rate
+
+    def build(self, input_shape):
+        self.dense_1 = keras.layers.Dense(
+            units=self.mlp_dim,
+            use_bias=self.use_bias,
+            activation="gelu",
+            bias_initializer=(
+                keras.initializers.RandomNormal(stddev=1e-6)
+                if self.use_bias
+                else None
+            ),
+            dtype=self.dtype_policy,
+            name="dense_1",
+        )
+        self.dense_1.build(input_shape)
+        self.dense_2 = keras.layers.Dense(
+            units=self.hidden_dim,
+            use_bias=self.use_bias,
+            bias_initializer=(
+                keras.initializers.RandomNormal(stddev=1e-6)
+                if self.use_bias
+                else None
+            ),
+            dtype=self.dtype_policy,
+            name="dense_2",
+        )
+        self.dense_2.build((None, None, self.mlp_dim))
+        self.dropout = keras.layers.Dropout(
+            self.dropout_rate, dtype=self.dtype_policy, name="dropout"
+        )
+        self.built = True
+
+    def call(self, inputs):
+        x = self.dense_1(inputs)
+        x = self.dense_2(x)
+        out = self.dropout(x)
+        return out
+
+
+class ViTPatchingAndEmbedding(keras.layers.Layer):
+    """Patches the image and embeds the patches.
+
+    Args:
+        image_size: int. Size of the input image (height or width).
+            Assumed to be square.
+        patch_size: int. Size of each image patch.
+        hidden_dim: int. Dimensionality of the patch embeddings.
+        num_channels: int. Number of channels in the input image. Defaults to
+            `3`.
+        data_format: str. `"channels_last"` or `"channels_first"`. Defaults to
+            `None` (which uses `"channels_last"`).
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        image_size,
+        patch_size,
+        hidden_dim,
+        num_channels=3,
+        data_format=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        num_patches = (image_size // patch_size) ** 2
+        num_positions = num_patches + 1
+
+        # === Config ===
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.hidden_dim = hidden_dim
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.num_positions = num_positions
+        self.data_format = standardize_data_format(data_format)
+
+    def build(self, input_shape):
+        self.class_token = self.add_weight(
+            shape=(
+                1,
+                1,
+                self.hidden_dim,
+            ),
+            initializer="random_normal",
+            dtype=self.variable_dtype,
+            name="class_token",
+        )
+        self.patch_embedding = keras.layers.Conv2D(
+            filters=self.hidden_dim,
+            kernel_size=self.patch_size,
+            strides=self.patch_size,
+            padding="valid",
+            activation=None,
+            dtype=self.dtype_policy,
+            data_format=self.data_format,
+            name="patch_embedding",
+        )
+        self.patch_embedding.build(input_shape)
+        self.position_embedding = keras.layers.Embedding(
+            self.num_positions,
+            self.hidden_dim,
+            dtype=self.dtype_policy,
+            embeddings_initializer=keras.initializers.RandomNormal(stddev=0.02),
+            name="position_embedding",
+        )
+        self.position_embedding.build((1, self.num_positions))
+        self.position_ids = keras.ops.expand_dims(
+            keras.ops.arange(self.num_positions), axis=0
+        )
+        self.built = True
+
+    def call(self, inputs):
+        patch_embeddings = self.patch_embedding(inputs)
+        if self.data_format == "channels_first":
+            patch_embeddings = ops.transpose(
+                patch_embeddings, axes=(0, 2, 3, 1)
+            )
+        embeddings_shape = ops.shape(patch_embeddings)
+        patch_embeddings = ops.reshape(
+            patch_embeddings, [embeddings_shape[0], -1, embeddings_shape[-1]]
+        )
+        class_token = ops.tile(self.class_token, (embeddings_shape[0], 1, 1))
+        position_embeddings = self.position_embedding(self.position_ids)
+        embeddings = ops.concatenate([class_token, patch_embeddings], axis=1)
+        return ops.add(embeddings, position_embeddings)
+
+    def compute_output_shape(self, input_shape):
+        return (
+            input_shape[0],
+            self.num_positions,
+            self.hidden_dim,
+        )
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "image_size": self.image_size,
+                "patch_size": self.patch_size,
+                "hidden_dim": self.hidden_dim,
+                "num_channels": self.num_channels,
+                "num_patches": self.num_patches,
+                "num_positions": self.num_positions,
+            }
+        )
+        return config
+
+
+class ViTEncoderBlock(keras.layers.Layer):
+    """Transformer encoder block.
+
+    Args:
+        num_heads: int. Number of attention heads.
+        hidden_dim: int. Dimensionality of the hidden representations.
+        mlp_dim: int. Dimensionality of the intermediate MLP layer.
+        use_mha_bias: bool. Whether to use bias in the multi-head attention
+            layer. Defaults to `True`.
+        use_mlp_bias: bool. Whether to use bias in the MLP layer. Defaults to
+            `True`.
+        dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to `0.0`.
+        attention_dropout: float. Dropout rate for the attention mechanism.
+            Between 0 and 1. Defaults to `0.0`.
+        layer_norm_epsilon: float. Small float value for layer normalization
+            stability. Defaults to `1e-6`.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        num_heads,
+        hidden_dim,
+        mlp_dim,
+        use_mha_bias=True,
+        use_mlp_bias=True,
+        dropout_rate=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        key_dim = hidden_dim // num_heads
+
+        # === Config ===
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.key_dim = key_dim
+        self.mlp_dim = mlp_dim
+        self.use_mha_bias = use_mha_bias
+        self.use_mlp_bias = use_mlp_bias
+        self.dropout_rate = dropout_rate
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+
+    def build(self, input_shape):
+        # Attention block
+        self.layer_norm_1 = keras.layers.LayerNormalization(
+            epsilon=self.layer_norm_epsilon,
+            name="ln_1",
+            dtype=self.dtype_policy,
+        )
+        self.layer_norm_1.build(input_shape)
+        self.mha = keras.layers.MultiHeadAttention(
+            num_heads=self.num_heads,
+            key_dim=self.key_dim,
+            use_bias=self.use_mha_bias,
+            dropout=self.attention_dropout,
+            name="mha",
+            dtype=self.dtype_policy,
+        )
+        self.mha.build(input_shape, input_shape)
+        self.dropout = keras.layers.Dropout(
+            self.dropout_rate, dtype=self.dtype_policy, name="dropout"
+        )
+
+        # MLP block
+        self.layer_norm_2 = keras.layers.LayerNormalization(
+            epsilon=self.layer_norm_epsilon,
+            name="ln_2",
+            dtype=self.dtype_policy,
+        )
+        self.layer_norm_2.build((None, None, self.hidden_dim))
+        self.mlp = MLP(
+            hidden_dim=self.hidden_dim,
+            mlp_dim=self.mlp_dim,
+            use_bias=self.use_mlp_bias,
+            name="mlp",
+            dtype=self.dtype_policy,
+        )
+        self.mlp.build((None, None, self.hidden_dim))
+        self.built = True
+
+    def call(self, inputs):
+        x = self.layer_norm_1(inputs)
+        x = self.mha(x, x)
+        x = self.dropout(x)
+        x = x + inputs
+
+        y = self.layer_norm_2(x)
+        y = self.mlp(y)
+
+        return x + y
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "num_heads": self.num_heads,
+                "hidden_dim": self.hidden_dim,
+                "key_dim": self.key_dim,
+                "mlp_dim": self.mlp_dim,
+                "use_mha_bias": self.use_mha_bias,
+                "use_mlp_bias": self.use_mlp_bias,
+                "dropout_rate": self.dropout_rate,
+                "attention_dropout": self.attention_dropout,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+            }
+        )
+        return config
+
+
+class ViTEncoder(keras.layers.Layer):
+    """Vision Transformer (ViT) encoder.
+
+    Args:
+        num_layers: int. Number of Transformer encoder blocks.
+        num_heads: int. Number of attention heads.
+        hidden_dim: int. Dimensionality of the hidden representations.
+        mlp_dim: int. Dimensionality of the intermediate MLP layer.
+        use_mha_bias: bool. Whether to use bias in the multi-head attention
+            layers. Defaults to `True`.
+        use_mlp_bias: bool. Whether to use bias in the MLP layers. Defaults to
+            `True`.
+        dropout_rate: float. Dropout rate. Between 0 and 1. Defaults to `0.0`.
+        attention_dropout: float. Dropout rate for the attention mechanism.
+            Between 0 and 1. Defaults to `0.0`.
+        layer_norm_epsilon: float. Small float value for layer normalization
+            stability. Defaults to `1e-6`.
+        **kwargs: Additional keyword arguments passed to `keras.layers.Layer`
+    """
+
+    def __init__(
+        self,
+        num_layers,
+        num_heads,
+        hidden_dim,
+        mlp_dim,
+        use_mha_bias=True,
+        use_mlp_bias=True,
+        dropout_rate=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # === config ===
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.mlp_dim = mlp_dim
+        self.use_mha_bias = use_mha_bias
+        self.use_mlp_bias = use_mlp_bias
+        self.dropout_rate = dropout_rate
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+
+    def build(self, input_shape):
+        self.encoder_layers = []
+        for i in range(self.num_layers):
+            encoder_block = ViTEncoderBlock(
+                num_heads=self.num_heads,
+                hidden_dim=self.hidden_dim,
+                mlp_dim=self.mlp_dim,
+                dropout_rate=self.dropout_rate,
+                use_mha_bias=self.use_mha_bias,
+                use_mlp_bias=self.use_mlp_bias,
+                attention_dropout=self.attention_dropout,
+                layer_norm_epsilon=self.layer_norm_epsilon,
+                dtype=self.dtype_policy,
+                name=f"tranformer_block_{i+1}",
+            )
+            encoder_block.build((None, None, self.hidden_dim))
+            self.encoder_layers.append(encoder_block)
+        self.dropout = keras.layers.Dropout(
+            self.dropout_rate, dtype=self.dtype_policy, name="dropout"
+        )
+        self.layer_norm = keras.layers.LayerNormalization(
+            epsilon=self.layer_norm_epsilon,
+            dtype=self.dtype_policy,
+            name="ln",
+        )
+        self.layer_norm.build((None, None, self.hidden_dim))
+        self.built = True
+
+    def call(self, inputs):
+        x = self.dropout(inputs)
+        for i in range(self.num_layers):
+            x = self.encoder_layers[i](x)
+        x = self.layer_norm(x)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "num_layers": self.num_layers,
+                "num_heads": self.num_heads,
+                "hidden_dim": self.hidden_dim,
+                "mlp_dim": self.mlp_dim,
+                "use_mha_bias": self.use_mha_bias,
+                "use_mlp_bias": self.use_mlp_bias,
+                "dropout_rate": self.dropout_rate,
+                "attention_dropout": self.attention_dropout,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+            }
+        )
+        return config
keras_hub/src/models/vit/vit_presets.py
@@ -0,0 +1,49 @@
+"""ViT model preset configurations."""
+
+# Metadata for loading pretrained model weights.
+backbone_presets = {
+    "vit_base_patch16_224_imagenet": {
+        "metadata": {
+            "description": (
+                "ViT-B16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 224x224 "
+            ),
+            "params": 85798656,
+            "path": "vit",
+        },
+        "kaggle_handle": "kaggle://keras/vit/keras/vit_base_patch16_224_imagenet/1",
+    },
+    "vit_base_patch16_384_imagenet": {
+        "metadata": {
+            "description": (
+                "ViT-B16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 384x384 "
+            ),
+            "params": 86090496,
+            "path": "vit",
+        },
+        "kaggle_handle": "kaggle://keras/vit/keras/vit_base_patch16_384_imagenet/1",
+    },
+    "vit_large_patch16_224_imagenet": {
+        "metadata": {
+            "description": (
+                "ViT-L16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 224x224 "
+            ),
+            "params": 303301632,
+            "path": "vit",
+        },
+        "kaggle_handle": "kaggle://keras/vit/keras/vit_large_patch16_224_imagenet/1",
+    },
+    "vit_large_patch16_384_imagenet": {
+        "metadata": {
+            "description": (
+                "ViT-L16 model pre-trained on the ImageNet 1k dataset with "
+                "image resolution of 384x384 "
+            ),
+            "params": 303690752,
+            "path": "vit",
+        },
+        "kaggle_handle": "kaggle://keras/vit/keras/vit_large_patch16_384_imagenet/1",
+    },
+}
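The preset table above maps preset names to Kaggle handles and metadata. Below is a sketch of pulling one of them through the usual keras-hub workflow, assuming the standard `from_preset()` task constructor; the exact head and output shape depend on the preset.

```python
import numpy as np
import keras_hub

# Sketch only: load a ViT preset listed above via the standard
# keras-hub `from_preset()` workflow (downloads weights from Kaggle).
classifier = keras_hub.models.ImageClassifier.from_preset(
    "vit_base_patch16_224_imagenet"
)
images = np.random.rand(1, 224, 224, 3).astype("float32")
preds = classifier.predict(images)
print(preds.shape)  # e.g. (1, 1000) for an ImageNet-1k classification head
```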
keras_hub/src/models/vit_det/vit_det_backbone.py
@@ -87,7 +87,7 @@ class ViTDetBackbone(Backbone):
         use_rel_pos=True,
         window_size=14,
         layer_norm_epsilon=1e-6,
-        **kwargs
+        **kwargs,
     ):
         # === Functional model ===
         img_input = keras.layers.Input(shape=image_shape, name="images")
@@ -179,7 +179,9 @@ class ViTDetBackbone(Backbone):
                 "use_abs_pos": self.use_abs_pos,
                 "use_rel_pos": self.use_rel_pos,
                 "window_size": self.window_size,
-                "global_attention_layer_indices":
+                "global_attention_layer_indices": (
+                    self.global_attention_layer_indices
+                ),
                 "layer_norm_epsilon": self.layer_norm_epsilon,
             }
         )
keras_hub/src/models/vit_det/vit_layers.py
@@ -117,7 +117,7 @@ class AddRelativePositionalEmbedding(keras.layers.Layer):
         """Calculate decomposed Relative Positional Embeddings

         The code has been adapted based on
-        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
+        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

         Args:
             attention_map (tensor): Attention map.
@@ -193,7 +193,7 @@ class MultiHeadAttentionWithRelativePE(keras.layers.Layer):
         use_bias=True,
         use_rel_pos=False,
         input_size=None,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.num_heads = num_heads
@@ -378,7 +378,7 @@ class WindowedTransformerEncoder(keras.layers.Layer):
         input_size=None,
         activation="gelu",
         layer_norm_epsilon=1e-6,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.project_dim = project_dim
keras_hub/src/models/whisper/whisper_audio_converter.py
@@ -172,9 +172,7 @@ class WhisperAudioConverter(AudioConverter):
         )

         def tf_log10(x):
-            """
-            Computes log base 10 of input tensor using TensorFlow's natural log operator.
-            """
+            """Computes log base 10 of input tensor using TensorFlow."""
             numerator = tf.math.log(x)
             denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
             return numerator / denominator
keras_hub/src/models/whisper/whisper_backbone.py
@@ -30,9 +30,10 @@ class WhisperBackbone(Backbone):
     It includes the embedding lookups and transformer layers, but not the head
     for predicting the next token.

-    The default constructor gives a fully customizable, randomly initialized
-    model with any number of layers, heads, and embedding dimensions.
-    preset architectures and weights, use the `from_preset()`
+    The default constructor gives a fully customizable, randomly initialized
+    Whisper model with any number of layers, heads, and embedding dimensions.
+    To load preset architectures and weights, use the `from_preset()`
+    constructor.

     Disclaimer: Pre-trained models are provided on an "as is" basis, without
     warranties or conditions of any kind. The underlying model is provided by a
@@ -53,8 +54,8 @@ class WhisperBackbone(Backbone):
         max_encoder_sequence_length: int. The maximum sequence length that the
             audio encoder can consume. Since the second convolutional layer in
             the encoder reduces the sequence length by half (stride of 2), we
-            use `max_encoder_sequence_length // 2` as the sequence length for
-            positional embedding layer.
+            use `max_encoder_sequence_length // 2` as the sequence length for
+            the positional embedding layer.
         max_decoder_sequence_length: int. The maximum sequence length that the
             text decoder can consume.
         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
keras_hub/src/models/whisper/whisper_decoder.py
@@ -14,11 +14,9 @@ class WhisperDecoder(TransformerDecoder):
     """Whisper decoder.

     Inherits from `keras_hub.layers.TransformerDecoder`, and overrides the
-    `build` method to use the
-    `
-
-    `keras_hub.models.whisper.whisper_cached_multi_head_attention.WhisperCachedMultiHeadAttention`
-    instead of `keras_hub.layers.cached_multi_head_attention.CachedMultiHeadAttention`.
+    `build` method to use the `WhisperMultiHeadAttention`
+    layer instead of `MultiHeadAttention` and `WhisperCachedMultiHeadAttention`
+    instead of `CachedMultiHeadAttention`.
     """

     def build(
keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py
@@ -9,7 +9,7 @@ from keras_hub.src.models.roberta.roberta_backbone import (
 from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import (
     XLMRobertaBackbone,
 )
-from keras_hub.src.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import (
+from keras_hub.src.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import (  # noqa: E501
     XLMRobertaMaskedLMPreprocessor,
 )

keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
@@ -20,8 +20,8 @@ class XLMRobertaMaskedLMPreprocessor(MaskedLMPreprocessor):

     This preprocessing layer will prepare inputs for a masked language modeling
     task. It is primarily intended for use with the
-    `keras_hub.models.XLMRobertaMaskedLM` task model. Preprocessing will occur
-    multiple steps.
+    `keras_hub.models.XLMRobertaMaskedLM` task model. Preprocessing will occur
+    in multiple steps.

     1. Tokenize any number of input segments using the `tokenizer`.
     2. Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
keras_hub/src/models/xlm_roberta/xlm_roberta_text_classifier.py
@@ -8,7 +8,7 @@ from keras_hub.src.models.text_classifier import TextClassifier
 from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import (
     XLMRobertaBackbone,
 )
-from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier_preprocessor import (
+from keras_hub.src.models.xlm_roberta.xlm_roberta_text_classifier_preprocessor import (  # noqa: E501
     XLMRobertaTextClassifierPreprocessor,
 )

@@ -40,9 +40,9 @@ class XLMRobertaTextClassifier(TextClassifier):
     Args:
         backbone: A `keras_hub.models.XLMRobertaBackbone` instance.
         num_classes: int. Number of classes to predict.
-        preprocessor: A `keras_hub.models.XLMRobertaTextClassifierPreprocessor`
-            `None`, this model will not apply preprocessing, and
-            be preprocessed before calling the model.
+        preprocessor: A `keras_hub.models.XLMRobertaTextClassifierPreprocessor`
+            or `None`. If `None`, this model will not apply preprocessing, and
+            inputs should be preprocessed before calling the model.
         activation: Optional `str` or callable. The activation function to use
             on the model outputs. Set `activation="softmax"` to return output
             probabilities. Defaults to `None`.
keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -177,7 +177,8 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):
         # Shift the tokens IDs left by one.
         tokens = tf.subtract(tokens, 1)

-        # Correct `unk_token_id`, `end_token_id`, `start_token_id`,
+        # Correct `unk_token_id`, `end_token_id`, `start_token_id`,
+        # respectively.
         # Note: The `pad_token_id` is taken as 0 (`unk_token_id`) since the
         # proto does not contain `pad_token_id`. This mapping of the pad token
         # is done automatically by the above subtraction.
keras_hub/src/models/xlnet/relative_attention.py
@@ -64,27 +64,28 @@ def _rel_shift(x, klen=-1):
 class TwoStreamRelativeAttention(keras.layers.MultiHeadAttention):
     """Two-stream relative self-attention for XLNet.

-    In XLNet, each token has two associated vectors at each self-attention
-    the content stream (h) and the query stream (g). The content stream
-    self-attention stream as in Transformer XL and represents the context
-    content (the token itself). The query stream only has access to
-    information and the position, but not the content.
+    In XLNet, each token has two associated vectors at each self-attention
+    layer, the content stream (h) and the query stream (g). The content stream
+    is the self-attention stream as in Transformer XL and represents the context
+    and content (the token itself). The query stream only has access to
+    contextual information and the position, but not the content.

-    This layer shares the same build signature as
-    but has different input/output
+    This layer shares the same build signature as
+    `keras.layers.MultiHeadAttention` but has different input/output
+    projections.

     We use the notations `B`, `T`, `S`, `M`, `L`, `E`, `P`, `dim`, `num_heads`
-    below, where
-    `B` is the batch dimension, `T` is the target sequence length,
+    below, where `B` is the batch dimension, `T` is the target sequence length,
     `S` in the source sequence length, `M` is the length of the state or memory,
     `L` is the length of relative positional encoding, `E` is the last dimension
-    of query input, `P` is the number of predictions, `dim` is the
-    of the encoder layers. and `num_heads` is the number of
+    of query input, `P` is the number of predictions, `dim` is the
+    dimensionality of the encoder layers. and `num_heads` is the number of
+    attention heads.

     Args:
         content_stream: `Tensor` of shape `[B, T, dim]`.
-        content_attention_bias: Bias `Tensor` for content based attention of
-            `[num_heads, dim]`.
+        content_attention_bias: Bias `Tensor` for content based attention of
+            shape `[num_heads, dim]`.
         positional_attention_bias: Bias `Tensor` for position based attention of
             shape `[num_heads, dim]`.
         query_stream: `Tensor` of shape `[B, P, dim]`.
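The reworded docstring above describes XLNet's two streams: the content stream h sees a token's own content, while the query stream g sees only position and context. The following is a minimal conceptual sketch of that distinction, using a plain `keras.layers.MultiHeadAttention` rather than the `TwoStreamRelativeAttention` layer itself (no relative position encoding or attention biases), so it is an illustration of the idea, not the keras-hub implementation.

```python
import numpy as np
import keras

# Conceptual sketch only: both streams attend over the content
# representations h, but the query stream uses g as its query, so it
# never sees the token's own content.
mha = keras.layers.MultiHeadAttention(num_heads=2, key_dim=8)

h = np.random.rand(1, 4, 16).astype("float32")  # content stream, [B, T, dim]
g = np.random.rand(1, 4, 16).astype("float32")  # query stream, [B, P, dim]

content_out = mha(query=h, value=h, key=h)  # content stream: full self-attention
query_out = mha(query=g, value=h, key=h)    # query stream: position/context only
print(content_out.shape, query_out.shape)   # (1, 4, 16) (1, 4, 16)
```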
keras_hub/src/models/xlnet/relative_attention.py
@@ -96,8 +97,8 @@ class TwoStreamRelativeAttention(keras.layers.MultiHeadAttention):
         segment_encoding: Optional `Tensor` representing the segmentation
             encoding as used in XLNet of shape `[2, num_heads, dim]`.
         segment_attention_bias: Optional trainable bias parameter added to the
-            query had when calculating the segment-based attention score used
-
+            query had when calculating the segment-based attention score used in
+            XLNet of shape `[num_heads, dim]`.
         state: Optional `Tensor` of shape `[B, M, E]`.
             If passed, this is also attended over as in Transformer XL.
         content_attention_mask: a boolean mask of shape `[B, T, S]` that
@@ -336,11 +337,11 @@ class TwoStreamRelativeAttention(keras.layers.MultiHeadAttention):
             dimension of query input.

         Args:
-            content_stream: The content representation, commonly referred to as
-            This serves a similar role to the standard hidden states in
+            content_stream: The content representation, commonly referred to as
+                h. This serves a similar role to the standard hidden states in
                 Transformer-XL.
-            content_attention_bias: A trainable bias parameter added to the
-            head when calculating the content-based attention score.
+            content_attention_bias: A trainable bias parameter added to the
+                query head when calculating the content-based attention score.
             positional_attention_bias: A trainable bias parameter added to the
                 query head when calculating the position-based attention score.
             query_stream: The query representation, commonly referred to as g.
keras_hub/src/models/xlnet/xlnet_backbone.py
@@ -49,8 +49,8 @@ class XLNetBackbone(Backbone):
             `[batch_size, sequence_length]`.
         segment_ids: Segment token indices to indicate first and second portions
             of the inputs of shape `[batch_size, sequence_length]`.
-        padding_mask: Mask to avoid performing attention on padding token
-            of shape `[batch_size, sequence_length]`.
+        padding_mask: Mask to avoid performing attention on padding token
+            indices of shape `[batch_size, sequence_length]`.

     Example:
     ```python
|