keras-hub-nightly 0.15.0.dev20240823171555 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/__init__.py +52 -0
- keras_hub/api/__init__.py +27 -0
- keras_hub/api/layers/__init__.py +47 -0
- keras_hub/api/metrics/__init__.py +24 -0
- keras_hub/api/models/__init__.py +249 -0
- keras_hub/api/samplers/__init__.py +29 -0
- keras_hub/api/tokenizers/__init__.py +35 -0
- keras_hub/src/__init__.py +13 -0
- keras_hub/src/api_export.py +53 -0
- keras_hub/src/layers/__init__.py +13 -0
- keras_hub/src/layers/modeling/__init__.py +13 -0
- keras_hub/src/layers/modeling/alibi_bias.py +143 -0
- keras_hub/src/layers/modeling/cached_multi_head_attention.py +137 -0
- keras_hub/src/layers/modeling/f_net_encoder.py +200 -0
- keras_hub/src/layers/modeling/masked_lm_head.py +239 -0
- keras_hub/src/layers/modeling/position_embedding.py +123 -0
- keras_hub/src/layers/modeling/reversible_embedding.py +311 -0
- keras_hub/src/layers/modeling/rotary_embedding.py +169 -0
- keras_hub/src/layers/modeling/sine_position_encoding.py +108 -0
- keras_hub/src/layers/modeling/token_and_position_embedding.py +150 -0
- keras_hub/src/layers/modeling/transformer_decoder.py +496 -0
- keras_hub/src/layers/modeling/transformer_encoder.py +262 -0
- keras_hub/src/layers/modeling/transformer_layer_utils.py +106 -0
- keras_hub/src/layers/preprocessing/__init__.py +13 -0
- keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +220 -0
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +319 -0
- keras_hub/src/layers/preprocessing/preprocessing_layer.py +62 -0
- keras_hub/src/layers/preprocessing/random_deletion.py +271 -0
- keras_hub/src/layers/preprocessing/random_swap.py +267 -0
- keras_hub/src/layers/preprocessing/start_end_packer.py +219 -0
- keras_hub/src/metrics/__init__.py +13 -0
- keras_hub/src/metrics/bleu.py +394 -0
- keras_hub/src/metrics/edit_distance.py +197 -0
- keras_hub/src/metrics/perplexity.py +181 -0
- keras_hub/src/metrics/rouge_base.py +204 -0
- keras_hub/src/metrics/rouge_l.py +97 -0
- keras_hub/src/metrics/rouge_n.py +125 -0
- keras_hub/src/models/__init__.py +13 -0
- keras_hub/src/models/albert/__init__.py +20 -0
- keras_hub/src/models/albert/albert_backbone.py +267 -0
- keras_hub/src/models/albert/albert_classifier.py +202 -0
- keras_hub/src/models/albert/albert_masked_lm.py +129 -0
- keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +194 -0
- keras_hub/src/models/albert/albert_preprocessor.py +206 -0
- keras_hub/src/models/albert/albert_presets.py +70 -0
- keras_hub/src/models/albert/albert_tokenizer.py +119 -0
- keras_hub/src/models/backbone.py +311 -0
- keras_hub/src/models/bart/__init__.py +20 -0
- keras_hub/src/models/bart/bart_backbone.py +261 -0
- keras_hub/src/models/bart/bart_preprocessor.py +276 -0
- keras_hub/src/models/bart/bart_presets.py +74 -0
- keras_hub/src/models/bart/bart_seq_2_seq_lm.py +490 -0
- keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +262 -0
- keras_hub/src/models/bart/bart_tokenizer.py +124 -0
- keras_hub/src/models/bert/__init__.py +23 -0
- keras_hub/src/models/bert/bert_backbone.py +227 -0
- keras_hub/src/models/bert/bert_classifier.py +183 -0
- keras_hub/src/models/bert/bert_masked_lm.py +131 -0
- keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +198 -0
- keras_hub/src/models/bert/bert_preprocessor.py +184 -0
- keras_hub/src/models/bert/bert_presets.py +147 -0
- keras_hub/src/models/bert/bert_tokenizer.py +112 -0
- keras_hub/src/models/bloom/__init__.py +20 -0
- keras_hub/src/models/bloom/bloom_attention.py +186 -0
- keras_hub/src/models/bloom/bloom_backbone.py +173 -0
- keras_hub/src/models/bloom/bloom_causal_lm.py +298 -0
- keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +176 -0
- keras_hub/src/models/bloom/bloom_decoder.py +206 -0
- keras_hub/src/models/bloom/bloom_preprocessor.py +185 -0
- keras_hub/src/models/bloom/bloom_presets.py +121 -0
- keras_hub/src/models/bloom/bloom_tokenizer.py +116 -0
- keras_hub/src/models/causal_lm.py +383 -0
- keras_hub/src/models/classifier.py +109 -0
- keras_hub/src/models/csp_darknet/__init__.py +13 -0
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +410 -0
- keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +133 -0
- keras_hub/src/models/deberta_v3/__init__.py +24 -0
- keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +210 -0
- keras_hub/src/models/deberta_v3/deberta_v3_classifier.py +228 -0
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm.py +135 -0
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +191 -0
- keras_hub/src/models/deberta_v3/deberta_v3_preprocessor.py +206 -0
- keras_hub/src/models/deberta_v3/deberta_v3_presets.py +82 -0
- keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +155 -0
- keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +227 -0
- keras_hub/src/models/deberta_v3/disentangled_self_attention.py +412 -0
- keras_hub/src/models/deberta_v3/relative_embedding.py +94 -0
- keras_hub/src/models/densenet/__init__.py +13 -0
- keras_hub/src/models/densenet/densenet_backbone.py +210 -0
- keras_hub/src/models/densenet/densenet_image_classifier.py +131 -0
- keras_hub/src/models/distil_bert/__init__.py +26 -0
- keras_hub/src/models/distil_bert/distil_bert_backbone.py +187 -0
- keras_hub/src/models/distil_bert/distil_bert_classifier.py +208 -0
- keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +137 -0
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +194 -0
- keras_hub/src/models/distil_bert/distil_bert_preprocessor.py +175 -0
- keras_hub/src/models/distil_bert/distil_bert_presets.py +57 -0
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +114 -0
- keras_hub/src/models/electra/__init__.py +20 -0
- keras_hub/src/models/electra/electra_backbone.py +247 -0
- keras_hub/src/models/electra/electra_preprocessor.py +154 -0
- keras_hub/src/models/electra/electra_presets.py +95 -0
- keras_hub/src/models/electra/electra_tokenizer.py +104 -0
- keras_hub/src/models/f_net/__init__.py +20 -0
- keras_hub/src/models/f_net/f_net_backbone.py +236 -0
- keras_hub/src/models/f_net/f_net_classifier.py +154 -0
- keras_hub/src/models/f_net/f_net_masked_lm.py +132 -0
- keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +196 -0
- keras_hub/src/models/f_net/f_net_preprocessor.py +177 -0
- keras_hub/src/models/f_net/f_net_presets.py +43 -0
- keras_hub/src/models/f_net/f_net_tokenizer.py +95 -0
- keras_hub/src/models/falcon/__init__.py +20 -0
- keras_hub/src/models/falcon/falcon_attention.py +156 -0
- keras_hub/src/models/falcon/falcon_backbone.py +164 -0
- keras_hub/src/models/falcon/falcon_causal_lm.py +291 -0
- keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/falcon/falcon_preprocessor.py +187 -0
- keras_hub/src/models/falcon/falcon_presets.py +30 -0
- keras_hub/src/models/falcon/falcon_tokenizer.py +110 -0
- keras_hub/src/models/falcon/falcon_transformer_decoder.py +255 -0
- keras_hub/src/models/feature_pyramid_backbone.py +73 -0
- keras_hub/src/models/gemma/__init__.py +20 -0
- keras_hub/src/models/gemma/gemma_attention.py +250 -0
- keras_hub/src/models/gemma/gemma_backbone.py +316 -0
- keras_hub/src/models/gemma/gemma_causal_lm.py +448 -0
- keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +167 -0
- keras_hub/src/models/gemma/gemma_decoder_block.py +241 -0
- keras_hub/src/models/gemma/gemma_preprocessor.py +191 -0
- keras_hub/src/models/gemma/gemma_presets.py +248 -0
- keras_hub/src/models/gemma/gemma_tokenizer.py +103 -0
- keras_hub/src/models/gemma/rms_normalization.py +40 -0
- keras_hub/src/models/gpt2/__init__.py +20 -0
- keras_hub/src/models/gpt2/gpt2_backbone.py +199 -0
- keras_hub/src/models/gpt2/gpt2_causal_lm.py +437 -0
- keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/gpt2/gpt2_preprocessor.py +187 -0
- keras_hub/src/models/gpt2/gpt2_presets.py +82 -0
- keras_hub/src/models/gpt2/gpt2_tokenizer.py +110 -0
- keras_hub/src/models/gpt_neo_x/__init__.py +13 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +251 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +175 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +201 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +141 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +258 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +145 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +88 -0
- keras_hub/src/models/image_classifier.py +90 -0
- keras_hub/src/models/llama/__init__.py +20 -0
- keras_hub/src/models/llama/llama_attention.py +225 -0
- keras_hub/src/models/llama/llama_backbone.py +188 -0
- keras_hub/src/models/llama/llama_causal_lm.py +327 -0
- keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +170 -0
- keras_hub/src/models/llama/llama_decoder.py +246 -0
- keras_hub/src/models/llama/llama_layernorm.py +48 -0
- keras_hub/src/models/llama/llama_preprocessor.py +189 -0
- keras_hub/src/models/llama/llama_presets.py +80 -0
- keras_hub/src/models/llama/llama_tokenizer.py +84 -0
- keras_hub/src/models/llama3/__init__.py +20 -0
- keras_hub/src/models/llama3/llama3_backbone.py +84 -0
- keras_hub/src/models/llama3/llama3_causal_lm.py +46 -0
- keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/llama3/llama3_preprocessor.py +21 -0
- keras_hub/src/models/llama3/llama3_presets.py +69 -0
- keras_hub/src/models/llama3/llama3_tokenizer.py +63 -0
- keras_hub/src/models/masked_lm.py +101 -0
- keras_hub/src/models/mistral/__init__.py +20 -0
- keras_hub/src/models/mistral/mistral_attention.py +238 -0
- keras_hub/src/models/mistral/mistral_backbone.py +203 -0
- keras_hub/src/models/mistral/mistral_causal_lm.py +328 -0
- keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +175 -0
- keras_hub/src/models/mistral/mistral_layer_norm.py +48 -0
- keras_hub/src/models/mistral/mistral_preprocessor.py +190 -0
- keras_hub/src/models/mistral/mistral_presets.py +48 -0
- keras_hub/src/models/mistral/mistral_tokenizer.py +82 -0
- keras_hub/src/models/mistral/mistral_transformer_decoder.py +265 -0
- keras_hub/src/models/mix_transformer/__init__.py +13 -0
- keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +181 -0
- keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +133 -0
- keras_hub/src/models/mix_transformer/mix_transformer_layers.py +300 -0
- keras_hub/src/models/opt/__init__.py +20 -0
- keras_hub/src/models/opt/opt_backbone.py +173 -0
- keras_hub/src/models/opt/opt_causal_lm.py +301 -0
- keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +177 -0
- keras_hub/src/models/opt/opt_preprocessor.py +188 -0
- keras_hub/src/models/opt/opt_presets.py +72 -0
- keras_hub/src/models/opt/opt_tokenizer.py +116 -0
- keras_hub/src/models/pali_gemma/__init__.py +23 -0
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +277 -0
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +313 -0
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +147 -0
- keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +160 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +78 -0
- keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +79 -0
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +566 -0
- keras_hub/src/models/phi3/__init__.py +20 -0
- keras_hub/src/models/phi3/phi3_attention.py +260 -0
- keras_hub/src/models/phi3/phi3_backbone.py +224 -0
- keras_hub/src/models/phi3/phi3_causal_lm.py +218 -0
- keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/phi3/phi3_decoder.py +260 -0
- keras_hub/src/models/phi3/phi3_layernorm.py +48 -0
- keras_hub/src/models/phi3/phi3_preprocessor.py +190 -0
- keras_hub/src/models/phi3/phi3_presets.py +50 -0
- keras_hub/src/models/phi3/phi3_rotary_embedding.py +137 -0
- keras_hub/src/models/phi3/phi3_tokenizer.py +94 -0
- keras_hub/src/models/preprocessor.py +207 -0
- keras_hub/src/models/resnet/__init__.py +13 -0
- keras_hub/src/models/resnet/resnet_backbone.py +612 -0
- keras_hub/src/models/resnet/resnet_image_classifier.py +136 -0
- keras_hub/src/models/roberta/__init__.py +20 -0
- keras_hub/src/models/roberta/roberta_backbone.py +184 -0
- keras_hub/src/models/roberta/roberta_classifier.py +209 -0
- keras_hub/src/models/roberta/roberta_masked_lm.py +136 -0
- keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +198 -0
- keras_hub/src/models/roberta/roberta_preprocessor.py +192 -0
- keras_hub/src/models/roberta/roberta_presets.py +43 -0
- keras_hub/src/models/roberta/roberta_tokenizer.py +132 -0
- keras_hub/src/models/seq_2_seq_lm.py +54 -0
- keras_hub/src/models/t5/__init__.py +20 -0
- keras_hub/src/models/t5/t5_backbone.py +261 -0
- keras_hub/src/models/t5/t5_layer_norm.py +35 -0
- keras_hub/src/models/t5/t5_multi_head_attention.py +324 -0
- keras_hub/src/models/t5/t5_presets.py +95 -0
- keras_hub/src/models/t5/t5_tokenizer.py +100 -0
- keras_hub/src/models/t5/t5_transformer_layer.py +178 -0
- keras_hub/src/models/task.py +419 -0
- keras_hub/src/models/vgg/__init__.py +13 -0
- keras_hub/src/models/vgg/vgg_backbone.py +158 -0
- keras_hub/src/models/vgg/vgg_image_classifier.py +124 -0
- keras_hub/src/models/vit_det/__init__.py +13 -0
- keras_hub/src/models/vit_det/vit_det_backbone.py +204 -0
- keras_hub/src/models/vit_det/vit_layers.py +565 -0
- keras_hub/src/models/whisper/__init__.py +20 -0
- keras_hub/src/models/whisper/whisper_audio_feature_extractor.py +260 -0
- keras_hub/src/models/whisper/whisper_backbone.py +305 -0
- keras_hub/src/models/whisper/whisper_cached_multi_head_attention.py +153 -0
- keras_hub/src/models/whisper/whisper_decoder.py +141 -0
- keras_hub/src/models/whisper/whisper_encoder.py +106 -0
- keras_hub/src/models/whisper/whisper_preprocessor.py +326 -0
- keras_hub/src/models/whisper/whisper_presets.py +148 -0
- keras_hub/src/models/whisper/whisper_tokenizer.py +163 -0
- keras_hub/src/models/xlm_roberta/__init__.py +26 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_backbone.py +81 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_classifier.py +225 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +141 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +195 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_preprocessor.py +205 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +43 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +191 -0
- keras_hub/src/models/xlnet/__init__.py +13 -0
- keras_hub/src/models/xlnet/relative_attention.py +459 -0
- keras_hub/src/models/xlnet/xlnet_backbone.py +222 -0
- keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +133 -0
- keras_hub/src/models/xlnet/xlnet_encoder.py +378 -0
- keras_hub/src/samplers/__init__.py +13 -0
- keras_hub/src/samplers/beam_sampler.py +207 -0
- keras_hub/src/samplers/contrastive_sampler.py +231 -0
- keras_hub/src/samplers/greedy_sampler.py +50 -0
- keras_hub/src/samplers/random_sampler.py +77 -0
- keras_hub/src/samplers/sampler.py +237 -0
- keras_hub/src/samplers/serialization.py +97 -0
- keras_hub/src/samplers/top_k_sampler.py +92 -0
- keras_hub/src/samplers/top_p_sampler.py +113 -0
- keras_hub/src/tests/__init__.py +13 -0
- keras_hub/src/tests/test_case.py +608 -0
- keras_hub/src/tokenizers/__init__.py +13 -0
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +638 -0
- keras_hub/src/tokenizers/byte_tokenizer.py +299 -0
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +267 -0
- keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +150 -0
- keras_hub/src/tokenizers/tokenizer.py +235 -0
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +355 -0
- keras_hub/src/tokenizers/word_piece_tokenizer.py +544 -0
- keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +176 -0
- keras_hub/src/utils/__init__.py +13 -0
- keras_hub/src/utils/keras_utils.py +130 -0
- keras_hub/src/utils/pipeline_model.py +293 -0
- keras_hub/src/utils/preset_utils.py +621 -0
- keras_hub/src/utils/python_utils.py +21 -0
- keras_hub/src/utils/tensor_utils.py +206 -0
- keras_hub/src/utils/timm/__init__.py +13 -0
- keras_hub/src/utils/timm/convert.py +37 -0
- keras_hub/src/utils/timm/convert_resnet.py +171 -0
- keras_hub/src/utils/transformers/__init__.py +13 -0
- keras_hub/src/utils/transformers/convert.py +101 -0
- keras_hub/src/utils/transformers/convert_bert.py +173 -0
- keras_hub/src/utils/transformers/convert_distilbert.py +184 -0
- keras_hub/src/utils/transformers/convert_gemma.py +187 -0
- keras_hub/src/utils/transformers/convert_gpt2.py +186 -0
- keras_hub/src/utils/transformers/convert_llama3.py +136 -0
- keras_hub/src/utils/transformers/convert_pali_gemma.py +303 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +97 -0
- keras_hub/src/version_utils.py +23 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +34 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +297 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/WHEEL +5 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/top_level.txt +1 -0
@@ -0,0 +1,496 @@
# Copyright 2024 The KerasHub Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import keras
from keras import ops

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.layers.modeling.cached_multi_head_attention import (
    CachedMultiHeadAttention,
)
from keras_hub.src.utils.keras_utils import clone_initializer

from keras_hub.src.layers.modeling.transformer_layer_utils import (  # isort:skip
    compute_causal_mask,
    merge_padding_and_attention_mask,
)


@keras_hub_export("keras_hub.layers.TransformerDecoder")
class TransformerDecoder(keras.layers.Layer):
    """Transformer decoder.

    This class follows the architecture of the transformer decoder layer in the
    paper [Attention is All You Need](https://arxiv.org/abs/1706.03762). Users
    can instantiate multiple instances of this class to stack up a decoder.

    By default, this layer will apply a causal mask to the decoder attention
    layer. You can also pass padding or attention masks directly to the layer
    during call, e.g. with `decoder_padding_mask` or `decoder_attention_mask`.

    This layer can be called with either one or two inputs. The number of inputs
    must be consistent across all calls. The options are as follows:
    `layer(decoder_sequence)`: no cross-attention will be built into the
        decoder block. This is useful when building a "decoder-only"
        transformer such as GPT-2.
    `layer(decoder_sequence, encoder_sequence)`: cross-attention will be
        built into the decoder block. This is useful when building an
        "encoder-decoder" transformer, such as the original transformer
        model described in Attention is All You Need.

    Args:
        intermediate_dim: int, the hidden size of feedforward network.
        num_heads: int, the number of heads in MultiHeadAttention.
        dropout: float. the dropout value, shared by
            MultiHeadAttention and feedforward network. Defaults to `0.`.
        activation: string or `keras.activations`. the
            activation function of feedforward network.
            Defaults to `"relu"`.
        layer_norm_epsilon: float. The eps value in layer
            normalization components. Defaults to `1e-5`.
        kernel_initializer: string or `keras.initializers` initializer.
            The kernel initializer for the dense and multiheaded
            attention layers. Defaults to `"glorot_uniform"`.
        bias_initializer: string or `keras.initializers` initializer.
            The bias initializer for the dense and multiheaded
            attention layers. Defaults to `"zeros"`.
        normalize_first: bool. If True, the inputs to the
            attention layer(s) and the intermediate dense layer are normalized
            (similar to GPT-2). If set to False, outputs of attention layer and
            intermediate dense layer are normalized (similar to BERT).
            Defaults to `False`.
        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
            including `name`, `trainable`, `dtype` etc.

    Example:
    ```python
    # Create a single transformer decoder layer.
    decoder = keras_hub.layers.TransformerDecoder(
        intermediate_dim=64, num_heads=8)

    # Create a simple model containing the decoder.
    decoder_input = keras.Input(shape=(10, 64))
    encoder_input = keras.Input(shape=(10, 64))
    output = decoder(decoder_input, encoder_input)
    model = keras.Model(
        inputs=(decoder_input, encoder_input),
        outputs=output,
    )

    # Call decoder on the inputs.
    decoder_input_data = np.random.uniform(size=(2, 10, 64))
    encoder_input_data = np.random.uniform(size=(2, 10, 64))
    decoder_output = model((decoder_input_data, encoder_input_data))
    ```

    References:
        - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)

    """

    def __init__(
        self,
        intermediate_dim,
        num_heads,
        dropout=0,
        activation="relu",
        layer_norm_epsilon=1e-05,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        normalize_first=False,
        **kwargs,
    ):
        # Work around for model saving, we need to ensure our model is built
        # immediately after restoring from config.
        decoder_sequence_shape = kwargs.pop("decoder_sequence_shape", None)
        encoder_sequence_shape = kwargs.pop("encoder_sequence_shape", None)

        super().__init__(**kwargs)
        self.intermediate_dim = intermediate_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.activation = keras.activations.get(activation)
        self.layer_norm_epsilon = layer_norm_epsilon
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)
        self.normalize_first = normalize_first
        self.supports_masking = True
        self._decoder_sequence_shape = None
        self._encoder_sequence_shape = None

        if decoder_sequence_shape:
            self.build(decoder_sequence_shape, encoder_sequence_shape)

    def build(
        self,
        decoder_sequence_shape,
        encoder_sequence_shape=None,
    ):
        self._decoder_sequence_shape = decoder_sequence_shape
        self._encoder_sequence_shape = encoder_sequence_shape
        # Infer the dimension of our hidden feature size from the build shape.
        hidden_dim = decoder_sequence_shape[-1]
        # Attention head size is `hidden_dim` over the number of heads.
        head_dim = int(hidden_dim // self.num_heads)
        if head_dim == 0:
            raise ValueError(
                "Attention `head_dim` computed cannot be zero. "
                f"The `hidden_dim` value of {hidden_dim} has to be equal to "
                f"or greater than `num_heads` value of {self.num_heads}."
            )

        # Self attention layers.
        self._self_attention_layer = CachedMultiHeadAttention(
            num_heads=self.num_heads,
            key_dim=head_dim,
            dropout=self.dropout,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
            dtype=self.dtype_policy,
            name="self_attention",
        )
        if hasattr(self._self_attention_layer, "_build_from_signature"):
            self._self_attention_layer._build_from_signature(
                query=decoder_sequence_shape,
                value=decoder_sequence_shape,
            )
        else:
            self._self_attention_layer.build(
                query_shape=decoder_sequence_shape,
                value_shape=decoder_sequence_shape,
            )
        self._self_attention_layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_epsilon,
            dtype=self.dtype_policy,
            name="self_attention_layer_norm",
        )
        self._self_attention_layer_norm.build(decoder_sequence_shape)
        self._self_attention_dropout = keras.layers.Dropout(
            rate=self.dropout,
            dtype=self.dtype_policy,
            name="self_attention_dropout",
        )

        # Cross attention layers are optional.
        self._cross_attention_layer = None
        if encoder_sequence_shape:
            self._cross_attention_layer = CachedMultiHeadAttention(
                num_heads=self.num_heads,
                key_dim=head_dim,
                value_dim=head_dim,
                dropout=self.dropout,
                kernel_initializer=clone_initializer(self.kernel_initializer),
                bias_initializer=clone_initializer(self.bias_initializer),
                dtype=self.dtype_policy,
                name="cross_attention",
            )
            if hasattr(self._cross_attention_layer, "_build_from_signature"):
                self._cross_attention_layer._build_from_signature(
                    query=decoder_sequence_shape,
                    value=encoder_sequence_shape,
                )
            else:
                self._cross_attention_layer.build(
                    query_shape=decoder_sequence_shape,
                    value_shape=encoder_sequence_shape,
                )
            self._cross_attention_layer_norm = keras.layers.LayerNormalization(
                epsilon=self.layer_norm_epsilon,
                dtype=self.dtype_policy,
                name="cross_attention_layer_norm",
            )
            self._cross_attention_layer_norm.build(decoder_sequence_shape)
            self._cross_attention_dropout = keras.layers.Dropout(
                rate=self.dropout,
                dtype=self.dtype_policy,
                name="cross_attention_dropout",
            )

        # Feedforward layers.
        self._feedforward_intermediate_dense = keras.layers.Dense(
            self.intermediate_dim,
            activation=self.activation,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
            dtype=self.dtype_policy,
            name="feedforward_intermediate_dense",
        )
        self._feedforward_intermediate_dense.build(decoder_sequence_shape)
        self._feedforward_output_dense = keras.layers.Dense(
            hidden_dim,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
            dtype=self.dtype_policy,
            name="feedforward_output_dense",
        )
        intermediate_shape = list(decoder_sequence_shape)
        intermediate_shape[-1] = self.intermediate_dim
        self._feedforward_output_dense.build(tuple(intermediate_shape))
        self._feedforward_layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_epsilon,
            dtype=self.dtype_policy,
            name="feedforward_layer_norm",
        )
        self._feedforward_layer_norm.build(decoder_sequence_shape)
        self._feedforward_dropout = keras.layers.Dropout(
            rate=self.dropout,
            dtype=self.dtype_policy,
            name="feedforward_dropout",
        )
        # Create layers based on input shape.
        self.built = True

    def call(
        self,
        decoder_sequence,
        encoder_sequence=None,
        decoder_padding_mask=None,
        decoder_attention_mask=None,
        encoder_padding_mask=None,
        encoder_attention_mask=None,
        self_attention_cache=None,
        self_attention_cache_update_index=None,
        cross_attention_cache=None,
        cross_attention_cache_update_index=None,
        use_causal_mask=True,
        training=None,
    ):
        """Forward pass of the TransformerDecoder.

        Args:
            decoder_sequence: a Tensor. The decoder input sequence.
            encoder_sequence: a Tensor. The encoder input sequence. For decoder
                only models (like GPT2), this should be left `None`. Once the
                model is called once without an encoder_sequence, you cannot
                call it again with encoder_sequence.
            decoder_padding_mask: a boolean Tensor, the padding mask of decoder
                sequence, must be of shape
                `[batch_size, decoder_sequence_length]`.
            decoder_attention_mask: a boolean Tensor. Customized decoder
                sequence mask, must be of shape
                `[batch_size, decoder_sequence_length, decoder_sequence_length]`.
            encoder_padding_mask: a boolean Tensor, the padding mask of encoder
                sequence, must be of shape
                `[batch_size, encoder_sequence_length]`.
            encoder_attention_mask: a boolean Tensor. Customized encoder
                sequence mask, must be of shape
                `[batch_size, encoder_sequence_length, encoder_sequence_length]`.
            self_attention_cache: a dense float Tensor. The cache of key/values
                pairs in the self-attention layer. Has shape
                `[batch_size, 2, max_seq_len, num_heads, key_dims]`.
            self_attention_cache_update_index: an int or int Tensor, the index
                at which to update the `self_attention_cache`. Usually, this is
                the index of the current token being processed during decoding.
            cross_attention_cache: a dense float Tensor. The cache of
                key/value pairs in the cross-attention layer. Has shape
                `[batch_size, 2, S, num_heads, key_dims]`.
            cross_attention_cache_update_index: an int or int Tensor, the index
                at which to update the `cross_attention_cache`. Usually, this is
                either `0` (compute the entire `cross_attention_cache`), or
                `None` (reuse a previously computed `cross_attention_cache`).
            use_causal_mask: bool, defaults to `True`. If true, a causal mask
                (masking out future input) is applied on the decoder sequence.
            training: a boolean indicating whether the layer should behave in
                training mode or in inference mode.

        Returns:
            One of three things, depending on call arguments:
            - `outputs`, if `self_attention_cache` is `None`.
            - `(outputs, self_attention_cache)`, if `self_attention_cache` is
                set and the layer has no cross-attention.
            - `(outputs, self_attention_cache, cross_attention_cache)`, if
                `self_attention_cache` and `cross_attention_cache` are set and
                the layer has cross-attention.
        """

        has_encoder_sequence = encoder_sequence is not None

        has_cross_attention = self._cross_attention_layer is not None
        if not has_cross_attention and has_encoder_sequence:
            raise ValueError(
                "The number of call arguments to "
                "`keras_hub.layers.TransformerDecoder` should not change. "
                "Use `layer(decoder_sequence, encoder_sequence)` to "
                "build a layer with cross attention, or "
                "`layer(decoder_sequence)` to build a layer without. "
                "This layer has been built without cross attention, but "
                "you are trying to call it with encoder_sequence."
            )
        elif has_cross_attention and not has_encoder_sequence:
            raise ValueError(
                "The number of call arguments to "
                "`keras_hub.layers.TransformerDecoder` should not change. "
                "Use `layer(decoder_sequence, encoder_sequence)` to "
                "build a layer with cross attention, or "
                "`layer(decoder_sequence)` to build a layer without. "
                "This layer has been built with cross attention, but "
                "you did not provide encoder_sequence."
            )

        has_self_attention_cache = self_attention_cache is not None
        has_cross_attention_cache = cross_attention_cache is not None
        if has_cross_attention and (
            has_self_attention_cache != has_cross_attention_cache
        ):
            raise ValueError(
                "When calling `keras_hub.layers.TransformerDecoder` with "
                "cross-attention (with both `encoder_sequence` and "
                "`decoder_sequence`), `self_attention_cache` and "
                "`cross_attention_cache` should both be set or both be `None`. "
                "One cannot be `None` while the other is not. Received: "
                f"self_attention_cache={self_attention_cache}, "
                f"cross_attention_cache={cross_attention_cache}."
            )

        self_attention_mask = self._compute_self_attention_mask(
            decoder_sequence=decoder_sequence,
            decoder_padding_mask=decoder_padding_mask,
            decoder_attention_mask=decoder_attention_mask,
            use_causal_mask=use_causal_mask,
            self_attention_cache=self_attention_cache,
            self_attention_cache_update_index=self_attention_cache_update_index,
        )

        x = decoder_sequence  # Intermediate result.

        # Self attention block.
        residual = x
        if self.normalize_first:
            x = self._self_attention_layer_norm(x)
        attention_output = self._self_attention_layer(
            query=x,
            value=x,
            attention_mask=self_attention_mask,
            cache=self_attention_cache,
            cache_update_index=self_attention_cache_update_index,
            training=training,
        )
        if self_attention_cache is None:
            x = attention_output
        else:
            x, self_attention_cache = attention_output
        x = self._self_attention_dropout(x, training=training)
        x = x + residual
        if not self.normalize_first:
            x = self._self_attention_layer_norm(x)

        # Cross attention is optional.
        if has_cross_attention:
            # Compute cross attention mask.
            cross_attention_mask = merge_padding_and_attention_mask(
                encoder_sequence, encoder_padding_mask, encoder_attention_mask
            )

            # Cross attention block.
            residual = x
            if self.normalize_first:
                x = self._cross_attention_layer_norm(x)
            attention_output = self._cross_attention_layer(
                query=x,
                value=encoder_sequence,
                attention_mask=cross_attention_mask,
                cache=cross_attention_cache,
                cache_update_index=cross_attention_cache_update_index,
                training=training,
            )
            if cross_attention_cache is None:
                x = attention_output
            else:
                x, cross_attention_cache = attention_output
            x = self._cross_attention_dropout(x, training=training)
            x = x + residual
            if not self.normalize_first:
                x = self._cross_attention_layer_norm(x)

        # Feedforward block.
        residual = x
        if self.normalize_first:
            x = self._feedforward_layer_norm(x)
        x = self._feedforward_intermediate_dense(x)
        x = self._feedforward_output_dense(x)
        x = self._feedforward_dropout(x, training=training)
        x = x + residual
        if not self.normalize_first:
            x = self._feedforward_layer_norm(x)

        if self_attention_cache is not None:
            if has_cross_attention:
                return (x, self_attention_cache, cross_attention_cache)
            else:
                return (x, self_attention_cache)
        else:
            return x

    def _compute_self_attention_mask(
        self,
        decoder_sequence,
        decoder_padding_mask,
        decoder_attention_mask,
        use_causal_mask,
        self_attention_cache,
        self_attention_cache_update_index,
    ):
        decoder_mask = merge_padding_and_attention_mask(
            decoder_sequence, decoder_padding_mask, decoder_attention_mask
        )
        if use_causal_mask:
            batch_size = ops.shape(decoder_sequence)[0]
            input_length = output_length = ops.shape(decoder_sequence)[1]
            # We need to handle a rectangular causal mask when doing cached
            # decoding. For generative inference, `decoder_sequence` will
            # generally be length 1, and `cache` will be the full generation length.
            if self_attention_cache is not None:
                input_length = ops.shape(self_attention_cache)[2]

            causal_mask = compute_causal_mask(
                batch_size,
                input_length,
                output_length,
                (
                    0
                    if self_attention_cache_update_index is None
                    else self_attention_cache_update_index
                ),
            )
            return (
                ops.minimum(decoder_mask, causal_mask)
                if decoder_mask is not None
                else causal_mask
            )
        return decoder_mask

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "intermediate_dim": self.intermediate_dim,
                "num_heads": self.num_heads,
                "dropout": self.dropout,
                "activation": keras.activations.serialize(self.activation),
                "layer_norm_epsilon": self.layer_norm_epsilon,
                "kernel_initializer": keras.initializers.serialize(
                    self.kernel_initializer
                ),
                "bias_initializer": keras.initializers.serialize(
                    self.bias_initializer
                ),
                "normalize_first": self.normalize_first,
                "decoder_sequence_shape": self._decoder_sequence_shape,
                "encoder_sequence_shape": self._encoder_sequence_shape,
            }
        )
        return config

    def compute_output_shape(self, decoder_sequence_shape):
        return decoder_sequence_shape
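The `call()` docstring above describes the caching contract: with no cache the layer returns a single output tensor, while with a `self_attention_cache` it returns `(outputs, self_attention_cache)` (plus a `cross_attention_cache` when cross-attention is built). The snippet below is not part of the wheel contents; it is a minimal sketch of decoder-only cached decoding against this API, assuming the nightly wheel above is installed, with all shapes and sizes chosen arbitrarily for illustration.

```python
import numpy as np

import keras_hub

batch_size, max_length, hidden_dim, num_heads = 2, 8, 64, 4
head_dim = hidden_dim // num_heads  # Per-head key dim, as computed in `build()`.

# Decoder-only layer: called with a single input, causal masking on by default.
decoder = keras_hub.layers.TransformerDecoder(
    intermediate_dim=128,
    num_heads=num_heads,
)

tokens = np.random.uniform(size=(batch_size, max_length, hidden_dim)).astype("float32")

# Uncached call: returns just the output tensor.
full_outputs = decoder(tokens)

# Cached call: the cache stacks key/value projections with shape
# `[batch_size, 2, max_length, num_heads, head_dim]`, and the layer returns
# `(outputs, self_attention_cache)` because no cross-attention was built.
cache = np.zeros((batch_size, 2, max_length, num_heads, head_dim), dtype="float32")
step_outputs = []
for index in range(max_length):
    step = tokens[:, index : index + 1, :]  # Feed one timestep at a time.
    output, cache = decoder(
        step,
        self_attention_cache=cache,
        self_attention_cache_update_index=index,
    )
    step_outputs.append(output)
# At inference, the concatenated step outputs should match `full_outputs`
# up to numerical precision, since the causal mask hides future positions.
```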