PyPI - keras-hub-nightly - Versions diffs - 0.15.0.dev20240823171555__py3-none-any.whl - Mend

keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (297) hide show

keras_hub/__init__.py +52 -0
keras_hub/api/__init__.py +27 -0
keras_hub/api/layers/__init__.py +47 -0
keras_hub/api/metrics/__init__.py +24 -0
keras_hub/api/models/__init__.py +249 -0
keras_hub/api/samplers/__init__.py +29 -0
keras_hub/api/tokenizers/__init__.py +35 -0
keras_hub/src/__init__.py +13 -0
keras_hub/src/api_export.py +53 -0
keras_hub/src/layers/__init__.py +13 -0
keras_hub/src/layers/modeling/__init__.py +13 -0
keras_hub/src/layers/modeling/alibi_bias.py +143 -0
keras_hub/src/layers/modeling/cached_multi_head_attention.py +137 -0
keras_hub/src/layers/modeling/f_net_encoder.py +200 -0
keras_hub/src/layers/modeling/masked_lm_head.py +239 -0
keras_hub/src/layers/modeling/position_embedding.py +123 -0
keras_hub/src/layers/modeling/reversible_embedding.py +311 -0
keras_hub/src/layers/modeling/rotary_embedding.py +169 -0
keras_hub/src/layers/modeling/sine_position_encoding.py +108 -0
keras_hub/src/layers/modeling/token_and_position_embedding.py +150 -0
keras_hub/src/layers/modeling/transformer_decoder.py +496 -0
keras_hub/src/layers/modeling/transformer_encoder.py +262 -0
keras_hub/src/layers/modeling/transformer_layer_utils.py +106 -0
keras_hub/src/layers/preprocessing/__init__.py +13 -0
keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +220 -0
keras_hub/src/layers/preprocessing/multi_segment_packer.py +319 -0
keras_hub/src/layers/preprocessing/preprocessing_layer.py +62 -0
keras_hub/src/layers/preprocessing/random_deletion.py +271 -0
keras_hub/src/layers/preprocessing/random_swap.py +267 -0
keras_hub/src/layers/preprocessing/start_end_packer.py +219 -0
keras_hub/src/metrics/__init__.py +13 -0
keras_hub/src/metrics/bleu.py +394 -0
keras_hub/src/metrics/edit_distance.py +197 -0
keras_hub/src/metrics/perplexity.py +181 -0
keras_hub/src/metrics/rouge_base.py +204 -0
keras_hub/src/metrics/rouge_l.py +97 -0
keras_hub/src/metrics/rouge_n.py +125 -0
keras_hub/src/models/__init__.py +13 -0
keras_hub/src/models/albert/__init__.py +20 -0
keras_hub/src/models/albert/albert_backbone.py +267 -0
keras_hub/src/models/albert/albert_classifier.py +202 -0
keras_hub/src/models/albert/albert_masked_lm.py +129 -0
keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +194 -0
keras_hub/src/models/albert/albert_preprocessor.py +206 -0
keras_hub/src/models/albert/albert_presets.py +70 -0
keras_hub/src/models/albert/albert_tokenizer.py +119 -0
keras_hub/src/models/backbone.py +311 -0
keras_hub/src/models/bart/__init__.py +20 -0
keras_hub/src/models/bart/bart_backbone.py +261 -0
keras_hub/src/models/bart/bart_preprocessor.py +276 -0
keras_hub/src/models/bart/bart_presets.py +74 -0
keras_hub/src/models/bart/bart_seq_2_seq_lm.py +490 -0
keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +262 -0
keras_hub/src/models/bart/bart_tokenizer.py +124 -0
keras_hub/src/models/bert/__init__.py +23 -0
keras_hub/src/models/bert/bert_backbone.py +227 -0
keras_hub/src/models/bert/bert_classifier.py +183 -0
keras_hub/src/models/bert/bert_masked_lm.py +131 -0
keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +198 -0
keras_hub/src/models/bert/bert_preprocessor.py +184 -0
keras_hub/src/models/bert/bert_presets.py +147 -0
keras_hub/src/models/bert/bert_tokenizer.py +112 -0
keras_hub/src/models/bloom/__init__.py +20 -0
keras_hub/src/models/bloom/bloom_attention.py +186 -0
keras_hub/src/models/bloom/bloom_backbone.py +173 -0
keras_hub/src/models/bloom/bloom_causal_lm.py +298 -0
keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +176 -0
keras_hub/src/models/bloom/bloom_decoder.py +206 -0
keras_hub/src/models/bloom/bloom_preprocessor.py +185 -0
keras_hub/src/models/bloom/bloom_presets.py +121 -0
keras_hub/src/models/bloom/bloom_tokenizer.py +116 -0
keras_hub/src/models/causal_lm.py +383 -0
keras_hub/src/models/classifier.py +109 -0
keras_hub/src/models/csp_darknet/__init__.py +13 -0
keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +410 -0
keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +133 -0
keras_hub/src/models/deberta_v3/__init__.py +24 -0
keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +210 -0
keras_hub/src/models/deberta_v3/deberta_v3_classifier.py +228 -0
keras_hub/src/models/deberta_v3/deberta_v3_masked_lm.py +135 -0
keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +191 -0
keras_hub/src/models/deberta_v3/deberta_v3_preprocessor.py +206 -0
keras_hub/src/models/deberta_v3/deberta_v3_presets.py +82 -0
keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +155 -0
keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +227 -0
keras_hub/src/models/deberta_v3/disentangled_self_attention.py +412 -0
keras_hub/src/models/deberta_v3/relative_embedding.py +94 -0
keras_hub/src/models/densenet/__init__.py +13 -0
keras_hub/src/models/densenet/densenet_backbone.py +210 -0
keras_hub/src/models/densenet/densenet_image_classifier.py +131 -0
keras_hub/src/models/distil_bert/__init__.py +26 -0
keras_hub/src/models/distil_bert/distil_bert_backbone.py +187 -0
keras_hub/src/models/distil_bert/distil_bert_classifier.py +208 -0
keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +137 -0
keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +194 -0
keras_hub/src/models/distil_bert/distil_bert_preprocessor.py +175 -0
keras_hub/src/models/distil_bert/distil_bert_presets.py +57 -0
keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +114 -0
keras_hub/src/models/electra/__init__.py +20 -0
keras_hub/src/models/electra/electra_backbone.py +247 -0
keras_hub/src/models/electra/electra_preprocessor.py +154 -0
keras_hub/src/models/electra/electra_presets.py +95 -0
keras_hub/src/models/electra/electra_tokenizer.py +104 -0
keras_hub/src/models/f_net/__init__.py +20 -0
keras_hub/src/models/f_net/f_net_backbone.py +236 -0
keras_hub/src/models/f_net/f_net_classifier.py +154 -0
keras_hub/src/models/f_net/f_net_masked_lm.py +132 -0
keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +196 -0
keras_hub/src/models/f_net/f_net_preprocessor.py +177 -0
keras_hub/src/models/f_net/f_net_presets.py +43 -0
keras_hub/src/models/f_net/f_net_tokenizer.py +95 -0
keras_hub/src/models/falcon/__init__.py +20 -0
keras_hub/src/models/falcon/falcon_attention.py +156 -0
keras_hub/src/models/falcon/falcon_backbone.py +164 -0
keras_hub/src/models/falcon/falcon_causal_lm.py +291 -0
keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/falcon/falcon_preprocessor.py +187 -0
keras_hub/src/models/falcon/falcon_presets.py +30 -0
keras_hub/src/models/falcon/falcon_tokenizer.py +110 -0
keras_hub/src/models/falcon/falcon_transformer_decoder.py +255 -0
keras_hub/src/models/feature_pyramid_backbone.py +73 -0
keras_hub/src/models/gemma/__init__.py +20 -0
keras_hub/src/models/gemma/gemma_attention.py +250 -0
keras_hub/src/models/gemma/gemma_backbone.py +316 -0
keras_hub/src/models/gemma/gemma_causal_lm.py +448 -0
keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +167 -0
keras_hub/src/models/gemma/gemma_decoder_block.py +241 -0
keras_hub/src/models/gemma/gemma_preprocessor.py +191 -0
keras_hub/src/models/gemma/gemma_presets.py +248 -0
keras_hub/src/models/gemma/gemma_tokenizer.py +103 -0
keras_hub/src/models/gemma/rms_normalization.py +40 -0
keras_hub/src/models/gpt2/__init__.py +20 -0
keras_hub/src/models/gpt2/gpt2_backbone.py +199 -0
keras_hub/src/models/gpt2/gpt2_causal_lm.py +437 -0
keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/gpt2/gpt2_preprocessor.py +187 -0
keras_hub/src/models/gpt2/gpt2_presets.py +82 -0
keras_hub/src/models/gpt2/gpt2_tokenizer.py +110 -0
keras_hub/src/models/gpt_neo_x/__init__.py +13 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +251 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +175 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +201 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +141 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +258 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +145 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +88 -0
keras_hub/src/models/image_classifier.py +90 -0
keras_hub/src/models/llama/__init__.py +20 -0
keras_hub/src/models/llama/llama_attention.py +225 -0
keras_hub/src/models/llama/llama_backbone.py +188 -0
keras_hub/src/models/llama/llama_causal_lm.py +327 -0
keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +170 -0
keras_hub/src/models/llama/llama_decoder.py +246 -0
keras_hub/src/models/llama/llama_layernorm.py +48 -0
keras_hub/src/models/llama/llama_preprocessor.py +189 -0
keras_hub/src/models/llama/llama_presets.py +80 -0
keras_hub/src/models/llama/llama_tokenizer.py +84 -0
keras_hub/src/models/llama3/__init__.py +20 -0
keras_hub/src/models/llama3/llama3_backbone.py +84 -0
keras_hub/src/models/llama3/llama3_causal_lm.py +46 -0
keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/llama3/llama3_preprocessor.py +21 -0
keras_hub/src/models/llama3/llama3_presets.py +69 -0
keras_hub/src/models/llama3/llama3_tokenizer.py +63 -0
keras_hub/src/models/masked_lm.py +101 -0
keras_hub/src/models/mistral/__init__.py +20 -0
keras_hub/src/models/mistral/mistral_attention.py +238 -0
keras_hub/src/models/mistral/mistral_backbone.py +203 -0
keras_hub/src/models/mistral/mistral_causal_lm.py +328 -0
keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +175 -0
keras_hub/src/models/mistral/mistral_layer_norm.py +48 -0
keras_hub/src/models/mistral/mistral_preprocessor.py +190 -0
keras_hub/src/models/mistral/mistral_presets.py +48 -0
keras_hub/src/models/mistral/mistral_tokenizer.py +82 -0
keras_hub/src/models/mistral/mistral_transformer_decoder.py +265 -0
keras_hub/src/models/mix_transformer/__init__.py +13 -0
keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +181 -0
keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +133 -0
keras_hub/src/models/mix_transformer/mix_transformer_layers.py +300 -0
keras_hub/src/models/opt/__init__.py +20 -0
keras_hub/src/models/opt/opt_backbone.py +173 -0
keras_hub/src/models/opt/opt_causal_lm.py +301 -0
keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +177 -0
keras_hub/src/models/opt/opt_preprocessor.py +188 -0
keras_hub/src/models/opt/opt_presets.py +72 -0
keras_hub/src/models/opt/opt_tokenizer.py +116 -0
keras_hub/src/models/pali_gemma/__init__.py +23 -0
keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +277 -0
keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +313 -0
keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +147 -0
keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +160 -0
keras_hub/src/models/pali_gemma/pali_gemma_presets.py +78 -0
keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +79 -0
keras_hub/src/models/pali_gemma/pali_gemma_vit.py +566 -0
keras_hub/src/models/phi3/__init__.py +20 -0
keras_hub/src/models/phi3/phi3_attention.py +260 -0
keras_hub/src/models/phi3/phi3_backbone.py +224 -0
keras_hub/src/models/phi3/phi3_causal_lm.py +218 -0
keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/phi3/phi3_decoder.py +260 -0
keras_hub/src/models/phi3/phi3_layernorm.py +48 -0
keras_hub/src/models/phi3/phi3_preprocessor.py +190 -0
keras_hub/src/models/phi3/phi3_presets.py +50 -0
keras_hub/src/models/phi3/phi3_rotary_embedding.py +137 -0
keras_hub/src/models/phi3/phi3_tokenizer.py +94 -0
keras_hub/src/models/preprocessor.py +207 -0
keras_hub/src/models/resnet/__init__.py +13 -0
keras_hub/src/models/resnet/resnet_backbone.py +612 -0
keras_hub/src/models/resnet/resnet_image_classifier.py +136 -0
keras_hub/src/models/roberta/__init__.py +20 -0
keras_hub/src/models/roberta/roberta_backbone.py +184 -0
keras_hub/src/models/roberta/roberta_classifier.py +209 -0
keras_hub/src/models/roberta/roberta_masked_lm.py +136 -0
keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +198 -0
keras_hub/src/models/roberta/roberta_preprocessor.py +192 -0
keras_hub/src/models/roberta/roberta_presets.py +43 -0
keras_hub/src/models/roberta/roberta_tokenizer.py +132 -0
keras_hub/src/models/seq_2_seq_lm.py +54 -0
keras_hub/src/models/t5/__init__.py +20 -0
keras_hub/src/models/t5/t5_backbone.py +261 -0
keras_hub/src/models/t5/t5_layer_norm.py +35 -0
keras_hub/src/models/t5/t5_multi_head_attention.py +324 -0
keras_hub/src/models/t5/t5_presets.py +95 -0
keras_hub/src/models/t5/t5_tokenizer.py +100 -0
keras_hub/src/models/t5/t5_transformer_layer.py +178 -0
keras_hub/src/models/task.py +419 -0
keras_hub/src/models/vgg/__init__.py +13 -0
keras_hub/src/models/vgg/vgg_backbone.py +158 -0
keras_hub/src/models/vgg/vgg_image_classifier.py +124 -0
keras_hub/src/models/vit_det/__init__.py +13 -0
keras_hub/src/models/vit_det/vit_det_backbone.py +204 -0
keras_hub/src/models/vit_det/vit_layers.py +565 -0
keras_hub/src/models/whisper/__init__.py +20 -0
keras_hub/src/models/whisper/whisper_audio_feature_extractor.py +260 -0
keras_hub/src/models/whisper/whisper_backbone.py +305 -0
keras_hub/src/models/whisper/whisper_cached_multi_head_attention.py +153 -0
keras_hub/src/models/whisper/whisper_decoder.py +141 -0
keras_hub/src/models/whisper/whisper_encoder.py +106 -0
keras_hub/src/models/whisper/whisper_preprocessor.py +326 -0
keras_hub/src/models/whisper/whisper_presets.py +148 -0
keras_hub/src/models/whisper/whisper_tokenizer.py +163 -0
keras_hub/src/models/xlm_roberta/__init__.py +26 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_backbone.py +81 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_classifier.py +225 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +141 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +195 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_preprocessor.py +205 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +43 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +191 -0
keras_hub/src/models/xlnet/__init__.py +13 -0
keras_hub/src/models/xlnet/relative_attention.py +459 -0
keras_hub/src/models/xlnet/xlnet_backbone.py +222 -0
keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +133 -0
keras_hub/src/models/xlnet/xlnet_encoder.py +378 -0
keras_hub/src/samplers/__init__.py +13 -0
keras_hub/src/samplers/beam_sampler.py +207 -0
keras_hub/src/samplers/contrastive_sampler.py +231 -0
keras_hub/src/samplers/greedy_sampler.py +50 -0
keras_hub/src/samplers/random_sampler.py +77 -0
keras_hub/src/samplers/sampler.py +237 -0
keras_hub/src/samplers/serialization.py +97 -0
keras_hub/src/samplers/top_k_sampler.py +92 -0
keras_hub/src/samplers/top_p_sampler.py +113 -0
keras_hub/src/tests/__init__.py +13 -0
keras_hub/src/tests/test_case.py +608 -0
keras_hub/src/tokenizers/__init__.py +13 -0
keras_hub/src/tokenizers/byte_pair_tokenizer.py +638 -0
keras_hub/src/tokenizers/byte_tokenizer.py +299 -0
keras_hub/src/tokenizers/sentence_piece_tokenizer.py +267 -0
keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +150 -0
keras_hub/src/tokenizers/tokenizer.py +235 -0
keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +355 -0
keras_hub/src/tokenizers/word_piece_tokenizer.py +544 -0
keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +176 -0
keras_hub/src/utils/__init__.py +13 -0
keras_hub/src/utils/keras_utils.py +130 -0
keras_hub/src/utils/pipeline_model.py +293 -0
keras_hub/src/utils/preset_utils.py +621 -0
keras_hub/src/utils/python_utils.py +21 -0
keras_hub/src/utils/tensor_utils.py +206 -0
keras_hub/src/utils/timm/__init__.py +13 -0
keras_hub/src/utils/timm/convert.py +37 -0
keras_hub/src/utils/timm/convert_resnet.py +171 -0
keras_hub/src/utils/transformers/__init__.py +13 -0
keras_hub/src/utils/transformers/convert.py +101 -0
keras_hub/src/utils/transformers/convert_bert.py +173 -0
keras_hub/src/utils/transformers/convert_distilbert.py +184 -0
keras_hub/src/utils/transformers/convert_gemma.py +187 -0
keras_hub/src/utils/transformers/convert_gpt2.py +186 -0
keras_hub/src/utils/transformers/convert_llama3.py +136 -0
keras_hub/src/utils/transformers/convert_pali_gemma.py +303 -0
keras_hub/src/utils/transformers/safetensor_utils.py +97 -0
keras_hub/src/version_utils.py +23 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +34 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +297 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/WHEEL +5 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/top_level.txt +1 -0

keras_hub/src/models/bart/bart_seq_2_seq_lm.py ADDED Viewed

@@ -0,0 +1,490 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras import ops
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.bart.bart_backbone import BartBackbone
+from keras_hub.src.models.bart.bart_seq_2_seq_lm_preprocessor import (
+    BartSeq2SeqLMPreprocessor,
+)
+from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
+from keras_hub.src.utils.tensor_utils import any_equal
+@keras_hub_export("keras_hub.models.BartSeq2SeqLM")
+class BartSeq2SeqLM(Seq2SeqLM):
+    """An end-to-end BART model for seq2seq language modeling.
+    A seq2seq language model (LM) is an encoder-decoder model which is used for
+    conditional text generation. The encoder is given a "context" text (fed to
+    the encoder), and the decoder predicts the next token based on both the
+    encoder inputs and the previous tokens. You can finetune `BartSeq2SeqLM` to
+    generate text for any seq2seq task (e.g., translation or summarization).
+    This model has a `generate()` method, which generates text based on
+    encoder inputs and an optional prompt for the decoder. The generation
+    strategy used is controlled by an additional `sampler` argument passed to
+    `compile()`. You can recompile the model with different `keras_hub.samplers`
+    objects to control the generation. By default, `"top_k"` sampling will be
+    used.
+    This model can optionally be configured with a `preprocessor` layer, in
+    which case it will automatically apply preprocessing to string inputs during
+    `fit()`, `predict()`, `evaluate()` and `generate()`. This is done by default
+    when creating the model with `from_preset()`.
+    Disclaimer: Pre-trained models are provided on an "as is" basis, without
+    warranties or conditions of any kind. The underlying model is provided by a
+    third party and subject to a separate license, available
+    [here](https://github.com/facebookresearch/fairseq/).
+    Args:
+        backbone: A `keras_hub.models.BartBackbone` instance.
+        preprocessor: A `keras_hub.models.BartSeq2SeqLMPreprocessor` or `None`.
+            If `None`, this model will not apply preprocessing, and inputs
+            should be preprocessed before calling the model.
+    Examples:
+    Use `generate()` to do text generation, given an input context.
+    ```python
+    bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset("bart_base_en")
+    bart_lm.generate("The quick brown fox", max_length=30)
+    # Generate with batched inputs.
+    bart_lm.generate(["The quick brown fox", "The whale"], max_length=30)
+    ```
+    Compile the `generate()` function with a custom sampler.
+    ```python
+    bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset("bart_base_en")
+    bart_lm.compile(sampler="greedy")
+    bart_lm.generate("The quick brown fox", max_length=30)
+    ```
+    Use `generate()` with encoder inputs and an incomplete decoder input (prompt).
+    ```python
+    bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset("bart_base_en")
+    bart_lm.generate(
+        {
+            "encoder_text": "The quick brown fox",
+            "decoder_text": "The fast"
+        }
+    )
+    ```
+    Use `generate()` without preprocessing.
+    ```python
+    # Preprocessed inputs, with encoder inputs corresponding to
+    # "The quick brown fox", and the decoder inputs to "The fast". Use
+    # `"padding_mask"` to indicate values that should not be overridden.
+    prompt = {
+        "encoder_token_ids": np.array([[0, 133, 2119, 6219, 23602, 2, 1, 1]]),
+        "encoder_padding_mask": np.array(
+            [[True, True, True, True, True, True, False, False]]
+        ),
+        "decoder_token_ids": np.array([[2, 0, 133, 1769, 2, 1, 1]]),
+        "decoder_padding_mask": np.array([[True, True, True, True, False, False, False]])
+    }
+    bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset(
+        "bart_base_en",
+        preprocessor=None,
+    )
+    bart_lm.generate(prompt)
+    ```
+    Call `fit()` on a single batch.
+    ```python
+    features = {
+        "encoder_text": ["The quick brown fox jumped.", "I forgot my homework."],
+        "decoder_text": ["The fast hazel fox leapt.", "I forgot my assignment."]
+    }
+    bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset("bart_base_en")
+    bart_lm.fit(x=features, batch_size=2)
+    ```
+    Call `fit()` without preprocessing.
+    ```python
+    x = {
+        "encoder_token_ids": np.array([[0, 133, 2119, 2, 1]] * 2),
+        "encoder_padding_mask": np.array([[1, 1, 1, 1, 0]] * 2),
+        "decoder_token_ids": np.array([[2, 0, 133, 1769, 2]] * 2),
+        "decoder_padding_mask": np.array([[1, 1, 1, 1, 1]] * 2),
+    }
+    y = np.array([[0, 133, 1769, 2, 1]] * 2)
+    sw = np.array([[1, 1, 1, 1, 0]] * 2)
+    bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset(
+        "bart_base_en",
+        preprocessor=None,
+    )
+    bart_lm.fit(x=x, y=y, sample_weight=sw, batch_size=2)
+    ```
+    Custom backbone and vocabulary.
+    ```python
+    features = {
+        "encoder_text": [" afternoon sun"],
+        "decoder_text": ["noon sun"],
+    }
+    vocab = {
+        "<s>": 0,
+        "<pad>": 1,
+        "</s>": 2,
+        "Ġafter": 5,
+        "noon": 6,
+        "Ġsun": 7,
+    }
+    merges = ["Ġ a", "Ġ s", "Ġ n", "e r", "n o", "o n", "Ġs u", "Ġa f", "no on"]
+    merges += ["Ġsu n", "Ġaf t", "Ġaft er"]
+    tokenizer = keras_hub.models.BartTokenizer(
+        vocabulary=vocab,
+        merges=merges,
+    )
+    preprocessor = keras_hub.models.BartSeq2SeqLMPreprocessor(
+        tokenizer=tokenizer,
+        encoder_sequence_length=128,
+        decoder_sequence_length=128,
+    )
+    backbone = keras_hub.models.BartBackbone(
+        vocabulary_size=50265,
+        num_layers=6,
+        num_heads=12,
+        hidden_dim=768,
+        intermediate_dim=3072,
+        max_sequence_length=128,
+    )
+    bart_lm = keras_hub.models.BartSeq2SeqLM(
+        backbone=backbone,
+        preprocessor=preprocessor,
+    )
+    bart_lm.fit(x=features, batch_size=2)
+    ```
+    """
+    backbone_cls = BartBackbone
+    preprocessor_cls = BartSeq2SeqLMPreprocessor
+    def __init__(
+        self,
+        backbone,
+        preprocessor=None,
+        **kwargs,
+    ):
+        # === Layers ===
+        self.backbone = backbone
+        self.preprocessor = preprocessor
+        # === Functional Model ===
+        inputs = backbone.input
+        hidden_states = backbone(inputs)["decoder_sequence_output"]
+        outputs = backbone.token_embedding(hidden_states, reverse=True)
+        super().__init__(
+            inputs=inputs,
+            outputs=outputs,
+            **kwargs,
+        )
+    def call_decoder_with_cache(
+        self,
+        encoder_hidden_states,
+        encoder_padding_mask,
+        decoder_token_ids,
+        self_attention_cache=None,
+        self_attention_cache_update_index=None,
+        cross_attention_cache=None,
+        cross_attention_cache_update_index=None,
+    ):
+        """Forward pass with a key/value caches for generative decoding..
+        `call_decoder_with_cache` adds an additional inference-time forward pass
+        for the model for seq2seq text generation. Unlike calling the model
+        directly, this method does two things to optimize text generation:
+        - Allows caching previous key/value tensors in the decoder's
+          self-attention layer to avoid recomputing the outputs of seen tokens.
+        - Allows caching key/value tensors in the decoder's cross-attention
+          layer to avoid recomputing the encoder outputs.
+        Args:
+            encoder_hidden_states: a dense float Tensor of shape
+                `(batch_size, encoder_sequence_length, hidden_dim)`. The
+                sequence of hidden states at the output of the encoder's last
+                layer.
+            encoder_padding_mask: a dense float Tensor of shape
+                `(batch_size, encoder_sequence_length)`. The padding mask for
+                the encoder input.
+            decoder_token_ids: a dense int Tensor of shape
+                `(batch_size, max_length)`. Input token ids to be fed to
+                the decoder.
+            self_attention_cache: a dense float Tensor of shape
+                `(batch_size, num_layers, 2, max_length, num_heads, key_dims)`.
+                The cached key/value tensors of previously seen tokens in the
+                decoder's self-attention layer.
+            self_attention_cache_update_index: an int or int Tensor, the index
+                at which to update the `self_attention_cache`. Usually, this is
+                the index of the current token being processed during decoding.
+            cross_attention_cache: a dense float Tensor of shape
+                `(batch_size, num_layers, 2, encoder_sequence_length, num_heads, key_dims)`.
+                The cached key/value tensors of the encoder outputs in the
+                decoder's cross-attention layer.
+            cross_attention_cache_update_index: an int or int Tensor, the index
+                at which to update the `cross_attention_cache`. Usually, this is
+                either `0` (compute the entire `cross_attention_cache`), or
+                `None` (reuse a previously computed `cross_attention_cache`).
+        Returns:
+            A `(logits, hidden_states, self_attention_cache, cross_attention_cache)`
+            tuple, where `logits` is the language model logits for the input
+            `decoder_token_ids`, `hidden_states` is the final hidden
+            representation of the input tokens, `self_attention_cache` is the
+            key/value cache in the decoder's self-attention layer and
+            `cross_attention_cache` is the key/value cache in the decoder's
+            cross-attention layer.
+        """
+        # Embedding layers.
+        tokens = self.backbone.token_embedding(decoder_token_ids)
+        positions = self.backbone.decoder_position_embedding(
+            tokens,
+            start_index=self_attention_cache_update_index,
+        )
+        # Sum, normalize and apply dropout to embeddings.
+        x = self.backbone.decoder_embeddings_add((tokens, positions))
+        x = self.backbone.decoder_embeddings_layer_norm(x)
+        x = self.backbone.decoder_embeddings_dropout(x)
+        # Every decoder layer has a separate cache for the self-attention layer
+        # and the cross-attention layer. We update all of them separately.
+        self_attention_caches = []
+        cross_attention_caches = []
+        for i, layer in enumerate(self.backbone.decoder_transformer_layers):
+            current_self_attention_cache = self_attention_cache[:, i, ...]
+            current_cross_attention_cache = cross_attention_cache[:, i, ...]
+            (
+                x,
+                next_self_attention_cache,
+                next_cross_attention_cache,
+            ) = layer(
+                decoder_sequence=x,
+                encoder_sequence=encoder_hidden_states,
+                encoder_padding_mask=encoder_padding_mask,
+                self_attention_cache=current_self_attention_cache,
+                self_attention_cache_update_index=self_attention_cache_update_index,
+                cross_attention_cache=current_cross_attention_cache,
+                cross_attention_cache_update_index=cross_attention_cache_update_index,
+            )
+            if self_attention_cache_update_index is not None:
+                self_attention_caches.append(next_self_attention_cache)
+            if cross_attention_cache_update_index is not None:
+                cross_attention_caches.append(next_cross_attention_cache)
+        if self_attention_cache_update_index is not None:
+            self_attention_cache = ops.stack(self_attention_caches, axis=1)
+        if cross_attention_cache_update_index is not None:
+            cross_attention_cache = ops.stack(cross_attention_caches, axis=1)
+        hidden_states = x
+        logits = self.backbone.token_embedding(hidden_states, reverse=True)
+        return (
+            logits,
+            hidden_states,
+            self_attention_cache,
+            cross_attention_cache,
+        )
+    def call_encoder(self, token_ids, padding_mask):
+        """Does a forward pass on the encoder and returns the encoder output."""
+        tokens = self.backbone.token_embedding(token_ids)
+        positions = self.backbone.encoder_position_embedding(tokens)
+        x = self.backbone.decoder_embeddings_add((tokens, positions))
+        x = self.backbone.encoder_embeddings_layer_norm(x)
+        x = self.backbone.encoder_embeddings_dropout(x)
+        for transformer_layer in self.backbone.encoder_transformer_layers:
+            x = transformer_layer(x, padding_mask=padding_mask)
+        return x
+    def _initialize_cache(self, encoder_token_ids, decoder_token_ids):
+        """Initializes empty self-attention cache and cross-attention cache."""
+        batch_size = ops.shape(encoder_token_ids)[0]
+        encoder_max_length = ops.shape(encoder_token_ids)[1]
+        decoder_max_length = ops.shape(decoder_token_ids)[1]
+        num_layers = self.backbone.num_layers
+        num_heads = self.backbone.num_heads
+        head_dim = self.backbone.hidden_dim // self.backbone.num_heads
+        shape = [
+            batch_size,
+            num_layers,
+            2,
+            decoder_max_length,
+            num_heads,
+            head_dim,
+        ]
+        self_attention_cache = ops.zeros(shape, dtype=self.compute_dtype)
+        shape[3] = encoder_max_length
+        cross_attention_cache = ops.zeros(shape, dtype=self.compute_dtype)
+        return (self_attention_cache, cross_attention_cache)
+    def _build_cache(
+        self, encoder_token_ids, encoder_padding_mask, decoder_token_ids
+    ):
+        """Builds the self-attention cache and the cross-attention cache (key/value pairs)."""
+        encoder_hidden_states = self.call_encoder(
+            token_ids=encoder_token_ids, padding_mask=encoder_padding_mask
+        )
+        self_attention_cache, cross_attention_cache = self._initialize_cache(
+            encoder_token_ids, decoder_token_ids
+        )
+        # Seed the self-attention cache and the cross-attention cache.
+        (
+            _,
+            hidden_states,
+            self_attention_cache,
+            cross_attention_cache,
+        ) = self.call_decoder_with_cache(
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_padding_mask=encoder_padding_mask,
+            decoder_token_ids=decoder_token_ids,
+            self_attention_cache=self_attention_cache,
+            self_attention_cache_update_index=0,
+            cross_attention_cache=cross_attention_cache,
+            cross_attention_cache_update_index=0,
+        )
+        return (
+            hidden_states,
+            encoder_hidden_states,
+            self_attention_cache,
+            cross_attention_cache,
+        )
+    def generate_step(
+        self,
+        inputs,
+        stop_token_ids=None,
+    ):
+        """A compilable generation function for a batch of inputs.
+        This function represents the inner, XLA-compilable, generation function
+        for a single batch of inputs. Inputs should have the same structure as
+        model inputs, a dictionary with keys `"encoder_token_ids"`,
+        `"encoder_padding_mask"`, `"decoder_token_ids"` and
+        `"decoder_padding_mask"`.
+        Args:
+            inputs: A dictionary with four keys - `"encoder_token_ids"`,
+                `"encoder_padding_mask"`, `"decoder_token_ids"` and
+                `"decoder_padding_mask"`, with batched tensor values.
+            stop_token_ids: Tuple of ids of end tokens to stop on. If all
+                sequences have produced a new stop token, generation
+                will stop.
+        Returns:
+            A dictionary with two keys - `"decoder_token_ids"`, the token ids
+            returned by the sampler, and `"decoder_padding_mask"`, a boolean
+            mask that is False only at positions after the first newly
+            generated stop token (all True when `stop_token_ids` is None).
+        """
+        (
+            encoder_token_ids,
+            encoder_padding_mask,
+            decoder_token_ids,
+            decoder_padding_mask,
+        ) = (
+            inputs["encoder_token_ids"],
+            inputs["encoder_padding_mask"],
+            inputs["decoder_token_ids"],
+            inputs["decoder_padding_mask"],
+        )
+        # Original batch size; `repeat_tensor` below uses it to work out how
+        # many times encoder-side tensors must be repeated.
+        batch_size = ops.shape(encoder_token_ids)[0]
+        # Create and seed cache with a single forward pass.
+        (
+            hidden_states,
+            encoder_hidden_states,
+            self_attention_cache,
+            cross_attention_cache,
+        ) = self._build_cache(
+            encoder_token_ids, encoder_padding_mask, decoder_token_ids
+        )
+        # Compute the lengths of all user inputted tokens ids.
+        row_lengths = ops.sum(ops.cast(decoder_padding_mask, "int32"), axis=-1)
+        # Start at the first index that has no user inputted id.
+        # The minimum over the batch is used, so generation begins where the
+        # shortest prompt ends.
+        index = ops.min(row_lengths)
+        # Callback handed to the sampler: runs the decoder on one token per
+        # sequence and returns `(logits, hidden_states, cache)` for that step.
+        def next(prompt, cache, index):
+            # The cache index is the index of our previous token.
+            cache_index = index - 1
+            num_samples = ops.shape(prompt)[0]
+            # Keep only the single token at `cache_index` for every sequence.
+            prompt = ops.slice(prompt, [0, cache_index], [num_samples, 1])
+            def repeat_tensor(x):
+                """Repeats tensors along batch axis to match dim for beam search."""
+                if ops.shape(x)[0] == num_samples:
+                    return x
+                return ops.repeat(x, repeats=num_samples // batch_size, axis=0)
+            # Only the self-attention cache is carried between steps; the
+            # cross-attention cache seeded in `_build_cache` is passed in as-is
+            # and the one returned here is discarded.
+            # NOTE(review): `cross_attention_cache_update_index=None`
+            # presumably tells the decoder to reuse the cross-attention cache
+            # without recomputing it -- confirm in `call_decoder_with_cache`.
+            logits, hidden_states, cache, _ = self.call_decoder_with_cache(
+                encoder_hidden_states=repeat_tensor(encoder_hidden_states),
+                encoder_padding_mask=repeat_tensor(encoder_padding_mask),
+                decoder_token_ids=prompt,
+                self_attention_cache=cache,
+                self_attention_cache_update_index=cache_index,
+                cross_attention_cache=repeat_tensor(cross_attention_cache),
+                cross_attention_cache_update_index=None,
+            )
+            # The decoder saw a length-1 sequence; drop that axis from the
+            # per-step outputs.
+            return (
+                ops.squeeze(logits, axis=1),
+                ops.squeeze(hidden_states, axis=1),
+                cache,
+            )
+        # The sampler drives the generation loop, calling `next` to obtain
+        # each step's outputs, and returns the updated token ids.
+        decoder_token_ids = self.sampler(
+            next=next,
+            prompt=decoder_token_ids,
+            cache=self_attention_cache,
+            index=index,
+            mask=decoder_padding_mask,
+            stop_token_ids=stop_token_ids,
+            hidden_states=hidden_states,
+            model=self,
+        )
+        # Compute an output padding mask with the token ids we updated.
+        if stop_token_ids is not None:
+            # Build a mask of `stop_token_ids` locations not in the original
+            # prompt (not in locations where `decoder_padding_mask` is True).
+            end_locations = any_equal(
+                decoder_token_ids,
+                stop_token_ids,
+                ops.logical_not(decoder_padding_mask),
+            )
+            end_locations = ops.cast(end_locations, "int32")
+            # Use cumsum to get ones in all locations after `end_locations`.
+            cumsum = ops.cast(ops.cumsum(end_locations, axis=-1), "int32")
+            # Subtracting `end_locations` leaves the count of stop tokens
+            # strictly before each position, so the first stop token itself
+            # stays at zero and is kept in the mask.
+            overflow = cumsum - end_locations
+            # Our padding mask is the inverse of these overflow locations.
+            decoder_padding_mask = ops.logical_not(ops.cast(overflow, "bool"))
+        else:
+            # Without early stopping, all locations will have been updated.
+            decoder_padding_mask = ops.ones_like(
+                decoder_token_ids, dtype="bool"
+            )
+        return {
+            "decoder_token_ids": decoder_token_ids,
+            "decoder_padding_mask": decoder_padding_mask,
+        }