keras-hub-nightly 0.16.1.dev202410020340__py3-none-any.whl → 0.19.0.dev202501260345__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- keras_hub/api/layers/__init__.py +21 -3
- keras_hub/api/models/__init__.py +71 -12
- keras_hub/api/tokenizers/__init__.py +1 -1
- keras_hub/src/bounding_box/__init__.py +2 -0
- keras_hub/src/bounding_box/converters.py +102 -12
- keras_hub/src/layers/modeling/f_net_encoder.py +1 -1
- keras_hub/src/layers/modeling/masked_lm_head.py +2 -1
- keras_hub/src/layers/modeling/reversible_embedding.py +3 -16
- keras_hub/src/layers/modeling/rms_normalization.py +36 -0
- keras_hub/src/layers/modeling/rotary_embedding.py +3 -2
- keras_hub/src/layers/modeling/token_and_position_embedding.py +1 -1
- keras_hub/src/layers/modeling/transformer_decoder.py +8 -6
- keras_hub/src/layers/modeling/transformer_encoder.py +29 -7
- keras_hub/src/layers/preprocessing/audio_converter.py +3 -7
- keras_hub/src/layers/preprocessing/image_converter.py +170 -34
- keras_hub/src/metrics/bleu.py +4 -3
- keras_hub/src/models/albert/albert_presets.py +4 -12
- keras_hub/src/models/albert/albert_text_classifier.py +7 -7
- keras_hub/src/models/backbone.py +3 -14
- keras_hub/src/models/bart/bart_backbone.py +4 -4
- keras_hub/src/models/bart/bart_presets.py +3 -9
- keras_hub/src/models/bart/bart_seq_2_seq_lm.py +9 -8
- keras_hub/src/models/basnet/__init__.py +5 -0
- keras_hub/src/models/basnet/basnet.py +122 -0
- keras_hub/src/models/basnet/basnet_backbone.py +366 -0
- keras_hub/src/models/basnet/basnet_image_converter.py +8 -0
- keras_hub/src/models/basnet/basnet_preprocessor.py +14 -0
- keras_hub/src/models/basnet/basnet_presets.py +17 -0
- keras_hub/src/models/bert/bert_presets.py +14 -32
- keras_hub/src/models/bert/bert_text_classifier.py +3 -3
- keras_hub/src/models/bloom/bloom_presets.py +8 -24
- keras_hub/src/models/causal_lm.py +56 -12
- keras_hub/src/models/clip/__init__.py +5 -0
- keras_hub/src/models/clip/clip_backbone.py +286 -0
- keras_hub/src/models/clip/clip_encoder_block.py +19 -4
- keras_hub/src/models/clip/clip_image_converter.py +8 -0
- keras_hub/src/models/clip/clip_presets.py +93 -0
- keras_hub/src/models/clip/clip_text_encoder.py +4 -1
- keras_hub/src/models/clip/clip_tokenizer.py +18 -3
- keras_hub/src/models/clip/clip_vision_embedding.py +101 -0
- keras_hub/src/models/clip/clip_vision_encoder.py +159 -0
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +2 -1
- keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +0 -109
- keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -1
- keras_hub/src/models/deberta_v3/deberta_v3_presets.py +5 -15
- keras_hub/src/models/deberta_v3/deberta_v3_text_classifier.py +4 -4
- keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +4 -4
- keras_hub/src/models/deberta_v3/disentangled_self_attention.py +3 -2
- keras_hub/src/models/deberta_v3/relative_embedding.py +1 -1
- keras_hub/src/models/deeplab_v3/__init__.py +7 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_backbone.py +200 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_image_converter.py +10 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_image_segmeter_preprocessor.py +16 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_layers.py +215 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_presets.py +17 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_segmenter.py +111 -0
- keras_hub/src/models/densenet/densenet_backbone.py +6 -4
- keras_hub/src/models/densenet/densenet_image_classifier.py +1 -129
- keras_hub/src/models/densenet/densenet_image_converter.py +2 -4
- keras_hub/src/models/densenet/densenet_presets.py +9 -15
- keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +1 -1
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +2 -2
- keras_hub/src/models/distil_bert/distil_bert_presets.py +5 -10
- keras_hub/src/models/distil_bert/distil_bert_text_classifier.py +5 -5
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +3 -3
- keras_hub/src/models/efficientnet/__init__.py +9 -0
- keras_hub/src/models/efficientnet/cba.py +141 -0
- keras_hub/src/models/efficientnet/efficientnet_backbone.py +160 -61
- keras_hub/src/models/efficientnet/efficientnet_image_classifier.py +14 -0
- keras_hub/src/models/efficientnet/efficientnet_image_classifier_preprocessor.py +16 -0
- keras_hub/src/models/efficientnet/efficientnet_image_converter.py +10 -0
- keras_hub/src/models/efficientnet/efficientnet_presets.py +193 -0
- keras_hub/src/models/efficientnet/fusedmbconv.py +84 -41
- keras_hub/src/models/efficientnet/mbconv.py +53 -22
- keras_hub/src/models/electra/electra_backbone.py +2 -2
- keras_hub/src/models/electra/electra_presets.py +6 -18
- keras_hub/src/models/f_net/f_net_presets.py +2 -6
- keras_hub/src/models/f_net/f_net_text_classifier.py +3 -3
- keras_hub/src/models/f_net/f_net_text_classifier_preprocessor.py +3 -3
- keras_hub/src/models/falcon/falcon_backbone.py +5 -3
- keras_hub/src/models/falcon/falcon_causal_lm.py +18 -8
- keras_hub/src/models/falcon/falcon_presets.py +1 -3
- keras_hub/src/models/falcon/falcon_tokenizer.py +7 -2
- keras_hub/src/models/feature_pyramid_backbone.py +1 -1
- keras_hub/src/models/flux/__init__.py +5 -0
- keras_hub/src/models/flux/flux_layers.py +496 -0
- keras_hub/src/models/flux/flux_maths.py +225 -0
- keras_hub/src/models/flux/flux_model.py +236 -0
- keras_hub/src/models/flux/flux_presets.py +3 -0
- keras_hub/src/models/flux/flux_text_to_image.py +146 -0
- keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +73 -0
- keras_hub/src/models/gemma/gemma_backbone.py +35 -20
- keras_hub/src/models/gemma/gemma_causal_lm.py +2 -2
- keras_hub/src/models/gemma/gemma_decoder_block.py +3 -1
- keras_hub/src/models/gemma/gemma_presets.py +29 -63
- keras_hub/src/models/gpt2/gpt2_causal_lm.py +2 -2
- keras_hub/src/models/gpt2/gpt2_presets.py +5 -14
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +2 -1
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +3 -3
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +2 -1
- keras_hub/src/models/image_classifier.py +147 -2
- keras_hub/src/models/image_classifier_preprocessor.py +6 -3
- keras_hub/src/models/image_object_detector.py +87 -0
- keras_hub/src/models/image_object_detector_preprocessor.py +57 -0
- keras_hub/src/models/image_segmenter.py +0 -5
- keras_hub/src/models/image_segmenter_preprocessor.py +29 -4
- keras_hub/src/models/image_to_image.py +417 -0
- keras_hub/src/models/inpaint.py +520 -0
- keras_hub/src/models/llama/llama_backbone.py +138 -12
- keras_hub/src/models/llama/llama_causal_lm.py +3 -1
- keras_hub/src/models/llama/llama_presets.py +10 -20
- keras_hub/src/models/llama3/llama3_backbone.py +12 -11
- keras_hub/src/models/llama3/llama3_causal_lm.py +1 -1
- keras_hub/src/models/llama3/llama3_presets.py +4 -12
- keras_hub/src/models/llama3/llama3_tokenizer.py +25 -2
- keras_hub/src/models/mistral/mistral_backbone.py +16 -15
- keras_hub/src/models/mistral/mistral_causal_lm.py +6 -4
- keras_hub/src/models/mistral/mistral_presets.py +3 -9
- keras_hub/src/models/mistral/mistral_transformer_decoder.py +2 -1
- keras_hub/src/models/mit/__init__.py +6 -0
- keras_hub/src/models/{mix_transformer/mix_transformer_backbone.py → mit/mit_backbone.py} +47 -36
- keras_hub/src/models/mit/mit_image_classifier.py +12 -0
- keras_hub/src/models/mit/mit_image_classifier_preprocessor.py +12 -0
- keras_hub/src/models/mit/mit_image_converter.py +8 -0
- keras_hub/src/models/{mix_transformer/mix_transformer_layers.py → mit/mit_layers.py} +20 -13
- keras_hub/src/models/mit/mit_presets.py +139 -0
- keras_hub/src/models/mobilenet/mobilenet_backbone.py +8 -8
- keras_hub/src/models/mobilenet/mobilenet_image_classifier.py +0 -92
- keras_hub/src/models/opt/opt_causal_lm.py +2 -2
- keras_hub/src/models/opt/opt_presets.py +4 -12
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +63 -17
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +3 -1
- keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +21 -23
- keras_hub/src/models/pali_gemma/pali_gemma_image_converter.py +2 -4
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +173 -17
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +14 -26
- keras_hub/src/models/phi3/phi3_causal_lm.py +3 -1
- keras_hub/src/models/phi3/phi3_decoder.py +0 -1
- keras_hub/src/models/phi3/phi3_presets.py +2 -6
- keras_hub/src/models/phi3/phi3_rotary_embedding.py +1 -1
- keras_hub/src/models/preprocessor.py +25 -11
- keras_hub/src/models/resnet/resnet_backbone.py +3 -14
- keras_hub/src/models/resnet/resnet_image_classifier.py +0 -137
- keras_hub/src/models/resnet/resnet_image_converter.py +2 -4
- keras_hub/src/models/resnet/resnet_presets.py +127 -18
- keras_hub/src/models/retinanet/__init__.py +5 -0
- keras_hub/src/models/retinanet/anchor_generator.py +52 -53
- keras_hub/src/models/retinanet/feature_pyramid.py +103 -39
- keras_hub/src/models/retinanet/non_max_supression.py +1 -0
- keras_hub/src/models/retinanet/prediction_head.py +192 -0
- keras_hub/src/models/retinanet/retinanet_backbone.py +146 -0
- keras_hub/src/models/retinanet/retinanet_image_converter.py +53 -0
- keras_hub/src/models/retinanet/retinanet_label_encoder.py +49 -51
- keras_hub/src/models/retinanet/retinanet_object_detector.py +381 -0
- keras_hub/src/models/retinanet/retinanet_object_detector_preprocessor.py +14 -0
- keras_hub/src/models/retinanet/retinanet_presets.py +16 -0
- keras_hub/src/models/roberta/roberta_backbone.py +2 -2
- keras_hub/src/models/roberta/roberta_presets.py +6 -8
- keras_hub/src/models/roberta/roberta_text_classifier.py +3 -3
- keras_hub/src/models/sam/__init__.py +5 -0
- keras_hub/src/models/sam/sam_backbone.py +2 -3
- keras_hub/src/models/sam/sam_image_converter.py +2 -4
- keras_hub/src/models/sam/sam_image_segmenter.py +16 -16
- keras_hub/src/models/sam/sam_image_segmenter_preprocessor.py +11 -1
- keras_hub/src/models/sam/sam_layers.py +5 -3
- keras_hub/src/models/sam/sam_presets.py +3 -9
- keras_hub/src/models/sam/sam_prompt_encoder.py +4 -2
- keras_hub/src/models/sam/sam_transformer.py +5 -4
- keras_hub/src/models/segformer/__init__.py +8 -0
- keras_hub/src/models/segformer/segformer_backbone.py +167 -0
- keras_hub/src/models/segformer/segformer_image_converter.py +8 -0
- keras_hub/src/models/segformer/segformer_image_segmenter.py +184 -0
- keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py +31 -0
- keras_hub/src/models/segformer/segformer_presets.py +136 -0
- keras_hub/src/models/seq_2_seq_lm_preprocessor.py +1 -1
- keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +8 -1
- keras_hub/src/models/stable_diffusion_3/mmdit.py +577 -190
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +189 -163
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +178 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +193 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py +43 -7
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +25 -14
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +1 -1
- keras_hub/src/models/t5/t5_backbone.py +5 -4
- keras_hub/src/models/t5/t5_presets.py +47 -19
- keras_hub/src/models/task.py +47 -39
- keras_hub/src/models/text_classifier.py +2 -2
- keras_hub/src/models/text_to_image.py +106 -41
- keras_hub/src/models/vae/__init__.py +1 -0
- keras_hub/src/models/vae/vae_backbone.py +184 -0
- keras_hub/src/models/vae/vae_layers.py +739 -0
- keras_hub/src/models/vgg/__init__.py +5 -0
- keras_hub/src/models/vgg/vgg_backbone.py +4 -24
- keras_hub/src/models/vgg/vgg_image_classifier.py +139 -33
- keras_hub/src/models/vgg/vgg_image_classifier_preprocessor.py +12 -0
- keras_hub/src/models/vgg/vgg_image_converter.py +8 -0
- keras_hub/src/models/vgg/vgg_presets.py +48 -0
- keras_hub/src/models/vit/__init__.py +5 -0
- keras_hub/src/models/vit/vit_backbone.py +152 -0
- keras_hub/src/models/vit/vit_image_classifier.py +187 -0
- keras_hub/src/models/vit/vit_image_classifier_preprocessor.py +12 -0
- keras_hub/src/models/vit/vit_image_converter.py +73 -0
- keras_hub/src/models/vit/vit_layers.py +391 -0
- keras_hub/src/models/vit/vit_presets.py +126 -0
- keras_hub/src/models/vit_det/vit_det_backbone.py +6 -4
- keras_hub/src/models/vit_det/vit_layers.py +3 -3
- keras_hub/src/models/whisper/whisper_audio_converter.py +2 -4
- keras_hub/src/models/whisper/whisper_backbone.py +6 -5
- keras_hub/src/models/whisper/whisper_decoder.py +3 -5
- keras_hub/src/models/whisper/whisper_presets.py +10 -30
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +1 -1
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +2 -2
- keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +2 -6
- keras_hub/src/models/xlm_roberta/xlm_roberta_text_classifier.py +4 -4
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +2 -1
- keras_hub/src/models/xlnet/relative_attention.py +20 -19
- keras_hub/src/models/xlnet/xlnet_backbone.py +2 -2
- keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +3 -5
- keras_hub/src/models/xlnet/xlnet_encoder.py +7 -9
- keras_hub/src/samplers/contrastive_sampler.py +2 -3
- keras_hub/src/samplers/sampler.py +2 -1
- keras_hub/src/tests/test_case.py +41 -6
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +7 -3
- keras_hub/src/tokenizers/byte_tokenizer.py +3 -10
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +2 -9
- keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +9 -11
- keras_hub/src/tokenizers/tokenizer.py +10 -13
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +9 -7
- keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +10 -3
- keras_hub/src/utils/keras_utils.py +2 -13
- keras_hub/src/utils/pipeline_model.py +3 -3
- keras_hub/src/utils/preset_utils.py +196 -144
- keras_hub/src/utils/tensor_utils.py +4 -4
- keras_hub/src/utils/timm/convert_densenet.py +6 -4
- keras_hub/src/utils/timm/convert_efficientnet.py +447 -0
- keras_hub/src/utils/timm/convert_resnet.py +1 -1
- keras_hub/src/utils/timm/convert_vgg.py +85 -0
- keras_hub/src/utils/timm/preset_loader.py +14 -9
- keras_hub/src/utils/transformers/convert_llama3.py +21 -5
- keras_hub/src/utils/transformers/convert_vit.py +150 -0
- keras_hub/src/utils/transformers/preset_loader.py +23 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +4 -3
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.16.1.dev202410020340.dist-info → keras_hub_nightly-0.19.0.dev202501260345.dist-info}/METADATA +86 -68
- keras_hub_nightly-0.19.0.dev202501260345.dist-info/RECORD +423 -0
- {keras_hub_nightly-0.16.1.dev202410020340.dist-info → keras_hub_nightly-0.19.0.dev202501260345.dist-info}/WHEEL +1 -1
- keras_hub/src/layers/preprocessing/resizing_image_converter.py +0 -138
- keras_hub/src/models/mix_transformer/__init__.py +0 -0
- keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +0 -119
- keras_hub/src/models/stable_diffusion_3/vae_image_decoder.py +0 -320
- keras_hub_nightly-0.16.1.dev202410020340.dist-info/RECORD +0 -357
- {keras_hub_nightly-0.16.1.dev202410020340.dist-info → keras_hub_nightly-0.19.0.dev202501260345.dist-info}/top_level.txt +0 -0
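The only file expanded in this section is the MMDiT backbone rewrite in keras_hub/src/models/stable_diffusion_3/mmdit.py (+577 -190), shown below. Its central new piece, `AdaptiveLayerNormalization`, applies the adaLN modulation `norm(x) * (1 + scale) + shift`, with `shift` and `scale` projected from a conditioning embedding. The sketch below reproduces just that operation with `keras.ops`; the standalone `adaln_modulate` helper, the shapes, and the freshly created layer instances are illustrative stand-ins, not the library API.

import numpy as np
from keras import layers, ops

def adaln_modulate(x, embedding, dense, norm):
    # Project the conditioning embedding into (shift, scale), then apply
    # norm(x) * (1 + scale) + shift -- the 2-modulation branch of adaLN.
    emb = dense(ops.silu(embedding))
    shift, scale = ops.split(emb, 2, axis=1)
    shift = ops.expand_dims(shift, axis=1)
    scale = ops.expand_dims(scale, axis=1)
    return ops.add(ops.multiply(norm(x), ops.add(1.0, scale)), shift)

hidden_dim = 8
x = np.random.rand(2, 16, hidden_dim).astype("float32")  # token features
emb = np.random.rand(2, hidden_dim).astype("float32")  # conditioning vector
dense = layers.Dense(2 * hidden_dim)  # projects embedding to shift and scale
norm = layers.LayerNormalization(epsilon=1e-6, center=False, scale=False)
print(adaln_modulate(x, emb, dense, norm).shape)  # (2, 16, 8)

Using `center=False, scale=False` on the normalization mirrors the diff: the affine part of the transform comes entirely from the conditioning embedding rather than from learned per-feature parameters.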
keras_hub/src/models/stable_diffusion_3/mmdit.py

@@ -2,7 +2,6 @@ import math
 
 import keras
 from keras import layers
-from keras import models
 from keras import ops
 
 from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding
@@ -11,7 +10,216 @@ from keras_hub.src.utils.keras_utils import gelu_approximate
 from keras_hub.src.utils.keras_utils import standardize_data_format
 
 
+class AdaptiveLayerNormalization(layers.Layer):
+    """Adaptive layer normalization.
+
+    Args:
+        embedding_dim: int. The size of each embedding vector.
+        num_modulations: int. The number of the modulation parameters. The
+            available values are `2`, `6` and `9`. Defaults to `2`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+
+    References:
+        - [FiLM: Visual Reasoning with a General Conditioning Layer](
+        https://arxiv.org/abs/1709.07871).
+        - [Scalable Diffusion Models with Transformers](
+        https://arxiv.org/abs/2212.09748).
+    """
+
+    def __init__(self, hidden_dim, num_modulations=2, **kwargs):
+        super().__init__(**kwargs)
+        hidden_dim = int(hidden_dim)
+        num_modulations = int(num_modulations)
+        if num_modulations not in (2, 6, 9):
+            raise ValueError(
+                "`num_modulations` must be `2`, `6` or `9`. "
+                f"Received: num_modulations={num_modulations}"
+            )
+        self.hidden_dim = hidden_dim
+        self.num_modulations = num_modulations
+
+        self.silu = layers.Activation("silu", dtype=self.dtype_policy)
+        self.dense = layers.Dense(
+            num_modulations * hidden_dim, dtype=self.dtype_policy, name="dense"
+        )
+        self.norm = layers.LayerNormalization(
+            epsilon=1e-6,
+            center=False,
+            scale=False,
+            dtype="float32",
+            name="norm",
+        )
+
+    def build(self, inputs_shape, embeddings_shape):
+        self.silu.build(embeddings_shape)
+        self.dense.build(embeddings_shape)
+        self.norm.build(inputs_shape)
+
+    def call(self, inputs, embeddings, training=None):
+        hidden_states = inputs
+        emb = self.dense(self.silu(embeddings), training=training)
+        if self.num_modulations == 9:
+            (
+                shift_msa,
+                scale_msa,
+                gate_msa,
+                shift_mlp,
+                scale_mlp,
+                gate_mlp,
+                shift_msa2,
+                scale_msa2,
+                gate_msa2,
+            ) = ops.split(emb, self.num_modulations, axis=1)
+        elif self.num_modulations == 6:
+            (
+                shift_msa,
+                scale_msa,
+                gate_msa,
+                shift_mlp,
+                scale_mlp,
+                gate_mlp,
+            ) = ops.split(emb, self.num_modulations, axis=1)
+        else:
+            shift_msa, scale_msa = ops.split(emb, self.num_modulations, axis=1)
+
+        scale_msa = ops.expand_dims(scale_msa, axis=1)
+        shift_msa = ops.expand_dims(shift_msa, axis=1)
+        norm_hidden_states = ops.cast(
+            self.norm(hidden_states, training=training), scale_msa.dtype
+        )
+        hidden_states = ops.add(
+            ops.multiply(norm_hidden_states, ops.add(1.0, scale_msa)), shift_msa
+        )
+
+        if self.num_modulations == 9:
+            scale_msa2 = ops.expand_dims(scale_msa2, axis=1)
+            shift_msa2 = ops.expand_dims(shift_msa2, axis=1)
+            hidden_states2 = ops.add(
+                ops.multiply(norm_hidden_states, ops.add(1.0, scale_msa2)),
+                shift_msa2,
+            )
+            return (
+                hidden_states,
+                gate_msa,
+                shift_mlp,
+                scale_mlp,
+                gate_mlp,
+                hidden_states2,
+                gate_msa2,
+            )
+        elif self.num_modulations == 6:
+            return hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp
+        else:
+            return hidden_states
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "num_modulations": self.num_modulations,
+            }
+        )
+        return config
+
+    def compute_output_shape(self, inputs_shape, embeddings_shape):
+        if self.num_modulations == 9:
+            return (
+                inputs_shape,
+                embeddings_shape,
+                embeddings_shape,
+                embeddings_shape,
+                embeddings_shape,
+                inputs_shape,
+                embeddings_shape,
+            )
+        elif self.num_modulations == 6:
+            return (
+                inputs_shape,
+                embeddings_shape,
+                embeddings_shape,
+                embeddings_shape,
+                embeddings_shape,
+            )
+        else:
+            return inputs_shape
+
+
+class MLP(layers.Layer):
+    """A MLP block with architecture.
+
+    Args:
+        hidden_dim: int. The number of units in the hidden layers.
+        output_dim: int. The number of units in the output layer.
+        activation: str of callable. Activation to use in the hidden layers.
+            Default to `None`.
+    """
+
+    def __init__(self, hidden_dim, output_dim, activation=None, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_dim = int(hidden_dim)
+        self.output_dim = int(output_dim)
+        self.activation = keras.activations.get(activation)
+
+        self.dense1 = layers.Dense(
+            hidden_dim,
+            activation=self.activation,
+            dtype=self.dtype_policy,
+            name="dense1",
+        )
+        self.dense2 = layers.Dense(
+            output_dim,
+            activation=None,
+            dtype=self.dtype_policy,
+            name="dense2",
+        )
+
+    def build(self, inputs_shape):
+        self.dense1.build(inputs_shape)
+        inputs_shape = self.dense1.compute_output_shape(inputs_shape)
+        self.dense2.build(inputs_shape)
+
+    def call(self, inputs, training=None):
+        x = self.dense1(inputs, training=training)
+        return self.dense2(x, training=training)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "output_dim": self.output_dim,
+                "activation": keras.activations.serialize(self.activation),
+            }
+        )
+        return config
+
+    def compute_output_shape(self, inputs_shape):
+        outputs_shape = list(inputs_shape)
+        outputs_shape[-1] = self.output_dim
+        return outputs_shape
+
+
 class PatchEmbedding(layers.Layer):
+    """A layer that converts images into patches.
+
+    Args:
+        patch_size: int. The size of one side of each patch.
+        hidden_dim: int. The number of units in the hidden layers.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the
+            inputs. `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)`
+            while `"channels_first"` corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(self, patch_size, hidden_dim, data_format=None, **kwargs):
         super().__init__(**kwargs)
         self.patch_size = int(patch_size)
@@ -48,6 +256,15 @@ class PatchEmbedding(layers.Layer):
 
 
 class AdjustablePositionEmbedding(PositionEmbedding):
+    """A position embedding layer with adjustable height and width.
+
+    The embedding will be cropped to match the input dimensions.
+
+    Args:
+        height: int. The maximum height of the embedding.
+        width: int. The maximum width of the embedding.
+    """
+
     def __init__(
         self,
         height,
@@ -84,11 +301,36 @@ class AdjustablePositionEmbedding(PositionEmbedding):
         position_embedding = ops.expand_dims(position_embedding, axis=0)
         return position_embedding
 
+    def get_config(self):
+        config = super().get_config()
+        del config["sequence_length"]
+        config.update(
+            {
+                "height": self.height,
+                "width": self.width,
+            }
+        )
+        return config
+
     def compute_output_shape(self, input_shape):
         return input_shape
 
 
 class TimestepEmbedding(layers.Layer):
+    """A layer which learns embedding for input timesteps.
+
+    Args:
+        embedding_dim: int. The size of the embedding.
+        frequency_dim: int. The size of the frequency.
+        max_period: int. Controls the maximum frequency of the embeddings.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+
+    Reference:
+        - [Denoising Diffusion Probabilistic Models](
+        https://arxiv.org/abs/2006.11239).
+    """
+
     def __init__(
         self, embedding_dim, frequency_dim=256, max_period=10000, **kwargs
     ):
@@ -96,17 +338,23 @@ class TimestepEmbedding(layers.Layer):
         self.embedding_dim = int(embedding_dim)
         self.frequency_dim = int(frequency_dim)
         self.max_period = float(max_period)
-
-
-        self.
-
-
-
-
-                layers.Dense(
-                    embedding_dim, activation=None, dtype=self.dtype_policy
+        # Precomputed `freq`.
+        half_frequency_dim = frequency_dim // 2
+        self.freq = ops.exp(
+            ops.divide(
+                ops.multiply(
+                    -math.log(max_period),
+                    ops.arange(0, half_frequency_dim, dtype="float32"),
                 ),
-
+                half_frequency_dim,
+            )
+        )
+
+        self.mlp = MLP(
+            embedding_dim,
+            embedding_dim,
+            "silu",
+            dtype=self.dtype_policy,
             name="mlp",
         )
 
@@ -118,16 +366,7 @@ class TimestepEmbedding(layers.Layer):
     def _create_timestep_embedding(self, inputs):
        compute_dtype = keras.backend.result_type(self.compute_dtype, "float32")
         x = ops.cast(inputs, compute_dtype)
-        freqs = ops.
-            ops.divide(
-                ops.multiply(
-                    -math.log(self.max_period),
-                    ops.arange(0, self.half_frequency_dim, dtype="float32"),
-                ),
-                self.half_frequency_dim,
-            )
-        )
-        freqs = ops.cast(freqs, compute_dtype)
+        freqs = ops.cast(self.freq, compute_dtype)
         x = ops.multiply(x, ops.expand_dims(freqs, axis=0))
         embedding = ops.concatenate([ops.cos(x), ops.sin(x)], axis=-1)
         if self.frequency_dim % 2 != 0:
@@ -143,6 +382,7 @@ class TimestepEmbedding(layers.Layer):
         config.update(
             {
                 "embedding_dim": self.embedding_dim,
+                "frequency_dim": self.frequency_dim,
                 "max_period": self.max_period,
             }
         )
@@ -154,13 +394,52 @@ class TimestepEmbedding(layers.Layer):
         return output_shape
 
 
+def get_qk_norm(qk_norm=None, q_norm_name="q_norm", k_norm_name="k_norm"):
+    """Helper function to instantiate `LayerNormalization` layers."""
+    q_norm = None
+    k_norm = None
+    if qk_norm is None:
+        pass
+    elif qk_norm == "rms_norm":
+        q_norm = layers.LayerNormalization(
+            epsilon=1e-6, rms_scaling=True, dtype="float32", name=q_norm_name
+        )
+        k_norm = layers.LayerNormalization(
+            epsilon=1e-6, rms_scaling=True, dtype="float32", name=k_norm_name
+        )
+    else:
+        raise NotImplementedError(
+            "Supported `qk_norm` are `'rms_norm'` and `None`. "
+            f"Received: qk_norm={qk_norm}."
+        )
+    return q_norm, k_norm
+
+
 class DismantledBlock(layers.Layer):
+    """A dismantled block used to compute pre- and post-attention.
+
+    Args:
+        num_heads: int. Number of attention heads.
+        hidden_dim: int. The number of units in the hidden layers.
+        mlp_ratio: float. The expansion ratio of `MLP`.
+        use_projection: bool. Whether to use an attention projection layer at
+            the end of the block.
+        qk_norm: Optional str. Whether to normalize the query and key tensors.
+            Available options are `None` and `"rms_norm"`. Defaults to `None`.
+        use_dual_attention: bool. Whether to use a dual attention in the
+            block. Defaults to `False`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(
         self,
         num_heads,
         hidden_dim,
         mlp_ratio=4.0,
         use_projection=True,
+        qk_norm=None,
+        use_dual_attention=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -168,33 +447,32 @@ class DismantledBlock(layers.Layer):
         self.hidden_dim = hidden_dim
         self.mlp_ratio = mlp_ratio
         self.use_projection = use_projection
+        self.qk_norm = qk_norm
+        self.use_dual_attention = use_dual_attention
 
         head_dim = hidden_dim // num_heads
         self.head_dim = head_dim
         mlp_hidden_dim = int(hidden_dim * mlp_ratio)
         self.mlp_hidden_dim = mlp_hidden_dim
-        num_modulations = 6 if use_projection else 2
-        self.num_modulations = num_modulations
 
-
-
-
-
-
-
-
-
-
-
-
-            center=False,
-            scale=False,
-            dtype="float32",
-            name="norm1",
-        )
+        if use_projection:
+            self.ada_layer_norm = AdaptiveLayerNormalization(
+                hidden_dim,
+                num_modulations=9 if use_dual_attention else 6,
+                dtype=self.dtype_policy,
+                name="ada_layer_norm",
+            )
+        else:
+            self.ada_layer_norm = AdaptiveLayerNormalization(
+                hidden_dim, dtype=self.dtype_policy, name="ada_layer_norm"
+            )
         self.attention_qkv = layers.Dense(
             hidden_dim * 3, dtype=self.dtype_policy, name="attention_qkv"
         )
+        q_norm, k_norm = get_qk_norm(qk_norm)
+        if q_norm is not None:
+            self.q_norm = q_norm
+            self.k_norm = k_norm
         if use_projection:
             self.attention_proj = layers.Dense(
                 hidden_dim, dtype=self.dtype_policy, name="attention_proj"
@@ -206,89 +484,165 @@ class DismantledBlock(layers.Layer):
             dtype="float32",
             name="norm2",
         )
-        self.mlp =
-
-
-
-
-                    dtype=self.dtype_policy,
-                ),
-                layers.Dense(
-                    hidden_dim,
-                    dtype=self.dtype_policy,
-                ),
-            ],
+        self.mlp = MLP(
+            mlp_hidden_dim,
+            hidden_dim,
+            gelu_approximate,
+            dtype=self.dtype_policy,
             name="mlp",
         )
 
+        if use_dual_attention:
+            self.attention_qkv2 = layers.Dense(
+                hidden_dim * 3, dtype=self.dtype_policy, name="attention_qkv2"
+            )
+            q_norm2, k_norm2 = get_qk_norm(qk_norm, "q_norm2", "k_norm2")
+            if q_norm is not None:
+                self.q_norm2 = q_norm2
+                self.k_norm2 = k_norm2
+            if use_projection:
+                self.attention_proj2 = layers.Dense(
+                    hidden_dim, dtype=self.dtype_policy, name="attention_proj2"
+                )
+
     def build(self, inputs_shape, timestep_embedding):
-        self.
+        self.ada_layer_norm.build(inputs_shape, timestep_embedding)
         self.attention_qkv.build(inputs_shape)
-        self.
+        if self.qk_norm is not None:
+            # [batch_size, sequence_length, num_heads, head_dim]
+            self.q_norm.build([None, None, self.num_heads, self.head_dim])
+            self.k_norm.build([None, None, self.num_heads, self.head_dim])
         if self.use_projection:
             self.attention_proj.build(inputs_shape)
            self.norm2.build(inputs_shape)
            self.mlp.build(inputs_shape)
+        if self.use_dual_attention:
+            self.attention_qkv2.build(inputs_shape)
+            if self.qk_norm is not None:
+                self.q_norm2.build([None, None, self.num_heads, self.head_dim])
+                self.k_norm2.build([None, None, self.num_heads, self.head_dim])
+            if self.use_projection:
+                self.attention_proj2.build(inputs_shape)
 
     def _modulate(self, inputs, shift, scale):
-
-
+        inputs = ops.cast(inputs, self.compute_dtype)
+        shift = ops.cast(shift, self.compute_dtype)
+        scale = ops.cast(scale, self.compute_dtype)
         return ops.add(ops.multiply(inputs, ops.add(scale, 1.0)), shift)
 
    def _compute_pre_attention(self, inputs, timestep_embedding, training=None):
         batch_size = ops.shape(inputs)[0]
         if self.use_projection:
-
-                timestep_embedding, training=training
-            )
-            modulation = ops.reshape(
-                modulation, (batch_size, 6, self.hidden_dim)
-            )
-            (
-                shift_msa,
-                scale_msa,
-                gate_msa,
-                shift_mlp,
-                scale_mlp,
-                gate_mlp,
-            ) = ops.unstack(modulation, 6, axis=1)
-            qkv = self.attention_qkv(
-                self._modulate(self.norm1(inputs), shift_msa, scale_msa),
-                training=training,
+            x, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.ada_layer_norm(
+                inputs, timestep_embedding, training=training
             )
+            qkv = self.attention_qkv(x, training=training)
             qkv = ops.reshape(
                 qkv, (batch_size, -1, 3, self.num_heads, self.head_dim)
             )
             q, k, v = ops.unstack(qkv, 3, axis=2)
+            if self.qk_norm is not None:
+                q = ops.cast(
+                    self.q_norm(q, training=training), self.compute_dtype
+                )
+                k = ops.cast(
+                    self.k_norm(k, training=training), self.compute_dtype
+                )
           return (q, k, v), (inputs, gate_msa, shift_mlp, scale_mlp, gate_mlp)
         else:
-
-                timestep_embedding, training=training
-            )
-            modulation = ops.reshape(
-                modulation, (batch_size, 2, self.hidden_dim)
-            )
-            shift_msa, scale_msa = ops.unstack(modulation, 2, axis=1)
-            qkv = self.attention_qkv(
-                self._modulate(self.norm1(inputs), shift_msa, scale_msa),
-                training=training,
+            x = self.ada_layer_norm(
+                inputs, timestep_embedding, training=training
             )
+            qkv = self.attention_qkv(x, training=training)
             qkv = ops.reshape(
                 qkv, (batch_size, -1, 3, self.num_heads, self.head_dim)
             )
             q, k, v = ops.unstack(qkv, 3, axis=2)
+            if self.qk_norm is not None:
+                q = ops.cast(
+                    self.q_norm(q, training=training), self.compute_dtype
+                )
+                k = ops.cast(
+                    self.k_norm(k, training=training), self.compute_dtype
+                )
             return (q, k, v)
 
     def _compute_post_attention(
         self, inputs, inputs_intermediates, training=None
     ):
         x, gate_msa, shift_mlp, scale_mlp, gate_mlp = inputs_intermediates
+        gate_msa = ops.expand_dims(gate_msa, axis=1)
+        shift_mlp = ops.expand_dims(shift_mlp, axis=1)
+        scale_mlp = ops.expand_dims(scale_mlp, axis=1)
+        gate_mlp = ops.expand_dims(gate_mlp, axis=1)
         attn = self.attention_proj(inputs, training=training)
-        x = ops.add(x, ops.multiply(
+        x = ops.add(x, ops.multiply(gate_msa, attn))
         x = ops.add(
             x,
             ops.multiply(
-
+                gate_mlp,
+                self.mlp(
+                    self._modulate(self.norm2(x), shift_mlp, scale_mlp),
+                    training=training,
+                ),
+            ),
+        )
+        return x
+
+    def _compute_pre_attention_with_dual_attention(
+        self, inputs, timestep_embedding, training=None
+    ):
+        batch_size = ops.shape(inputs)[0]
+        x, gate_msa, shift_mlp, scale_mlp, gate_mlp, x2, gate_msa2 = (
+            self.ada_layer_norm(inputs, timestep_embedding, training=training)
+        )
+        # Compute the main attention
+        qkv = self.attention_qkv(x, training=training)
+        qkv = ops.reshape(
+            qkv, (batch_size, -1, 3, self.num_heads, self.head_dim)
+        )
+        q, k, v = ops.unstack(qkv, 3, axis=2)
+        if self.qk_norm is not None:
+            q = ops.cast(self.q_norm(q, training=training), self.compute_dtype)
+            k = ops.cast(self.k_norm(k, training=training), self.compute_dtype)
+        # Compute the dual attention
+        qkv2 = self.attention_qkv2(x2, training=training)
+        qkv2 = ops.reshape(
+            qkv2, (batch_size, -1, 3, self.num_heads, self.head_dim)
+        )
+        q2, k2, v2 = ops.unstack(qkv2, 3, axis=2)
+        if self.qk_norm is not None:
+            q2 = ops.cast(
+                self.q_norm2(q2, training=training), self.compute_dtype
+            )
+            k2 = ops.cast(
+                self.k_norm2(k2, training=training), self.compute_dtype
+            )
+        return (
+            (q, k, v),
+            (q2, k2, v2),
+            (inputs, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2),
+        )
+
+    def _compute_post_attention_with_dual_attention(
+        self, inputs, inputs2, inputs_intermediates, training=None
+    ):
+        x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2 = (
+            inputs_intermediates
+        )
+        gate_msa = ops.expand_dims(gate_msa, axis=1)
+        shift_mlp = ops.expand_dims(shift_mlp, axis=1)
+        scale_mlp = ops.expand_dims(scale_mlp, axis=1)
+        gate_mlp = ops.expand_dims(gate_mlp, axis=1)
+        gate_msa2 = ops.expand_dims(gate_msa2, axis=1)
+        attn = self.attention_proj(inputs, training=training)
+        x = ops.add(x, ops.multiply(gate_msa, attn))
+        attn2 = self.attention_proj2(inputs2, training=training)
+        x = ops.add(x, ops.multiply(gate_msa2, attn2))
+        x = ops.add(
+            x,
+            ops.multiply(
+                gate_mlp,
                 self.mlp(
                     self._modulate(self.norm2(x), shift_mlp, scale_mlp),
                     training=training,
@@ -302,17 +656,28 @@ class DismantledBlock(layers.Layer):
         inputs,
         timestep_embedding=None,
         inputs_intermediates=None,
+        inputs2=None,  # For the dual attention.
         pre_attention=True,
         training=None,
     ):
         if pre_attention:
-
-
-
+            if self.use_dual_attention:
+                return self._compute_pre_attention_with_dual_attention(
+                    inputs, timestep_embedding, training=training
+                )
+            else:
+                return self._compute_pre_attention(
+                    inputs, timestep_embedding, training=training
+                )
         else:
-
-
-
+            if self.use_dual_attention:
+                return self._compute_post_attention_with_dual_attention(
+                    inputs, inputs2, inputs_intermediates, training=training
+                )
+            else:
+                return self._compute_post_attention(
+                    inputs, inputs_intermediates, training=training
+                )
 
     def get_config(self):
         config = super().get_config()
@@ -322,18 +687,47 @@ class DismantledBlock(layers.Layer):
                 "hidden_dim": self.hidden_dim,
                 "mlp_ratio": self.mlp_ratio,
                 "use_projection": self.use_projection,
+                "qk_norm": self.qk_norm,
+                "use_dual_attention": self.use_dual_attention,
             }
         )
         return config
 
 
 class MMDiTBlock(layers.Layer):
+    """A MMDiT block consisting of two `DismantledBlock` layers.
+
+    One `DismantledBlock` processes the input latents, and the other processes
+    the context embedding. This block integrates two modalities within the
+    attention operation, allowing each representation to operate in its own
+    space while considering the other.
+
+    Args:
+        num_heads: int. Number of attention heads.
+        hidden_dim: int. The number of units in the hidden layers.
+        mlp_ratio: float. The expansion ratio of `MLP`.
+        use_context_projection: bool. Whether to use an attention projection
+            layer at the end of the context block.
+        qk_norm: Optional str. Whether to normalize the query and key tensors.
+            Available options are `None` and `"rms_norm"`. Defaults to `None`.
+        use_dual_attention: bool. Whether to use a dual attention in the
+            block. Defaults to `False`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+
+    Reference:
+        - [Scaling Rectified Flow Transformers for High-Resolution Image Synthesis](
+        https://arxiv.org/abs/2403.03206)
+    """
+
     def __init__(
         self,
         num_heads,
         hidden_dim,
         mlp_ratio=4.0,
         use_context_projection=True,
+        qk_norm=None,
+        use_dual_attention=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -341,18 +735,20 @@ class MMDiTBlock(layers.Layer):
         self.hidden_dim = hidden_dim
         self.mlp_ratio = mlp_ratio
         self.use_context_projection = use_context_projection
+        self.qk_norm = qk_norm
+        self.use_dual_attention = use_dual_attention
 
         head_dim = hidden_dim // num_heads
         self.head_dim = head_dim
         self._inverse_sqrt_key_dim = 1.0 / math.sqrt(head_dim)
-        self._dot_product_equation = "aecd,abcd->acbe"
-        self._combine_equation = "acbe,aecd->abcd"
 
         self.x_block = DismantledBlock(
             num_heads=num_heads,
             hidden_dim=hidden_dim,
             mlp_ratio=mlp_ratio,
             use_projection=True,
+            qk_norm=qk_norm,
+            use_dual_attention=use_dual_attention,
             dtype=self.dtype_policy,
             name="x_block",
        )
@@ -361,6 +757,7 @@ class MMDiTBlock(layers.Layer):
             hidden_dim=hidden_dim,
             mlp_ratio=mlp_ratio,
             use_projection=use_context_projection,
+            qk_norm=qk_norm,
             dtype=self.dtype_policy,
             name="context_block",
         )
@@ -371,20 +768,35 @@ class MMDiTBlock(layers.Layer):
         self.context_block.build(context_shape, timestep_embedding_shape)
 
     def _compute_attention(self, query, key, value):
-
-
-
-
-
-
-
-
-
-
-
-
+        batch_size = ops.shape(query)[0]
+
+        # Use the fast path when `ops.dot_product_attention` and flash attention
+        # are available.
+        if hasattr(ops, "dot_product_attention") and hasattr(
+            keras.config, "is_flash_attention_enabled"
+        ):
+            encoded = ops.dot_product_attention(
+                query,
+                key,
+                value,
+                scale=self._inverse_sqrt_key_dim,
+                flash_attention=keras.config.is_flash_attention_enabled(),
+            )
+            return ops.reshape(
+                encoded, (batch_size, -1, self.num_heads * self.head_dim)
+            )
+
+        # Ref: jax.nn.dot_product_attention
+        # https://github.com/jax-ml/jax/blob/db89c245ac66911c98f265a05956fdfa4bc79d83/jax/_src/nn/functions.py#L846
+        logits = ops.einsum("BTNH,BSNH->BNTS", query, key)
+        logits = ops.multiply(logits, self._inverse_sqrt_key_dim)
+        probs = self.softmax(logits)
+        probs = ops.cast(probs, self.compute_dtype)
+        encoded = ops.einsum("BNTS,BSNH->BTNH", probs, value)
+        encoded = ops.reshape(
+            encoded, (batch_size, -1, self.num_heads * self.head_dim)
         )
-        return
+        return encoded
 
     def call(self, inputs, context, timestep_embedding, training=None):
         # Compute pre-attention.
@@ -402,9 +814,14 @@ class MMDiTBlock(layers.Layer):
             training=training,
         )
         context_len = ops.shape(context_qkv[0])[1]
-
-
-
+        if self.x_block.use_dual_attention:
+            x_qkv, x_qkv2, x_intermediates = self.x_block(
+                x, timestep_embedding=timestep_embedding, training=training
+            )
+        else:
+            x_qkv, x_intermediates = self.x_block(
+                x, timestep_embedding=timestep_embedding, training=training
+            )
         q = ops.concatenate([context_qkv[0], x_qkv[0]], axis=1)
         k = ops.concatenate([context_qkv[1], x_qkv[1]], axis=1)
         v = ops.concatenate([context_qkv[2], x_qkv[2]], axis=1)
@@ -415,12 +832,23 @@ class MMDiTBlock(layers.Layer):
         x_attention = attention[:, context_len:]
 
         # Compute post-attention.
-
-
-
-
-
-
+        if self.x_block.use_dual_attention:
+            q2, k2, v2 = x_qkv2
+            x_attention2 = self._compute_attention(q2, k2, v2)
+            x = self.x_block(
+                x_attention,
+                inputs_intermediates=x_intermediates,
+                inputs2=x_attention2,
+                pre_attention=False,
+                training=training,
+            )
+        else:
+            x = self.x_block(
+                x_attention,
+                inputs_intermediates=x_intermediates,
+                pre_attention=False,
+                training=training,
+            )
         if self.use_context_projection:
             context = self.context_block(
                 context_attention,
@@ -440,6 +868,8 @@ class MMDiTBlock(layers.Layer):
                 "hidden_dim": self.hidden_dim,
                 "mlp_ratio": self.mlp_ratio,
                 "use_context_projection": self.use_context_projection,
+                "qk_norm": self.qk_norm,
+                "use_dual_attention": self.use_dual_attention,
             }
         )
         return config
@@ -453,74 +883,16 @@ class MMDiTBlock(layers.Layer):
         return inputs_shape
 
 
-class
-
-        super().__init__(**kwargs)
-        self.hidden_dim = hidden_dim
-        self.output_dim = output_dim
-        num_modulation = 2
-
-        self.adaptive_norm_modulation = models.Sequential(
-            [
-                layers.Activation("silu", dtype=self.dtype_policy),
-                layers.Dense(
-                    num_modulation * hidden_dim, dtype=self.dtype_policy
-                ),
-            ],
-            name="adaptive_norm_modulation",
-        )
-        self.norm = layers.LayerNormalization(
-            epsilon=1e-6,
-            center=False,
-            scale=False,
-            dtype="float32",
-            name="norm",
-        )
-        self.output_dense = layers.Dense(
-            output_dim,
-            use_bias=True,
-            dtype=self.dtype_policy,
-            name="output_dense",
-        )
-
-    def build(self, inputs_shape, timestep_embedding_shape):
-        self.adaptive_norm_modulation.build(timestep_embedding_shape)
-        self.norm.build(inputs_shape)
-        self.output_dense.build(inputs_shape)
-
-    def _modulate(self, inputs, shift, scale):
-        shift = ops.expand_dims(shift, axis=1)
-        scale = ops.expand_dims(scale, axis=1)
-        return ops.add(ops.multiply(inputs, ops.add(scale, 1.0)), shift)
-
-    def call(self, inputs, timestep_embedding, training=None):
-        x = inputs
-        modulation = self.adaptive_norm_modulation(
-            timestep_embedding, training=training
-        )
-        modulation = ops.reshape(modulation, (-1, 2, self.hidden_dim))
-        shift, scale = ops.unstack(modulation, 2, axis=1)
-        x = self._modulate(self.norm(x), shift, scale)
-        x = self.output_dense(x, training=training)
-        return x
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "hidden_dim": self.hidden_dim,
-                "output_dim": self.output_dim,
-            }
-        )
-        return config
-
-    def compute_output_shape(self, inputs_shape):
-        outputs_shape = list(inputs_shape)
-        outputs_shape[-1] = self.output_dim
-        return outputs_shape
+class Unpatch(layers.Layer):
+    """A layer that reconstructs the image from hidden patches.
 
+    Args:
+        patch_size: int. The size of each square patch in the input image.
+        output_dim: int. The number of units in the output layer.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
 
-class Unpatch(layers.Layer):
     def __init__(self, patch_size, output_dim, **kwargs):
         super().__init__(**kwargs)
         self.patch_size = int(patch_size)
@@ -556,7 +928,7 @@ class Unpatch(layers.Layer):
 
 
 class MMDiT(Backbone):
-    """Multimodal Diffusion Transformer (MMDiT) model
+    """A Multimodal Diffusion Transformer (MMDiT) model.
 
     MMDiT is introduced in [
     Scaling Rectified Flow Transformers for High-Resolution Image Synthesis](
@@ -574,6 +946,12 @@ class MMDiT(Backbone):
         latent_shape: tuple. The shape of the latent image.
         context_shape: tuple. The shape of the context.
         pooled_projection_shape: tuple. The shape of the pooled projection.
+        qk_norm: Optional str. Whether to normalize the query and key tensors in
+            the intermediate blocks. Available options are `None` and
+            `"rms_norm"`. Defaults to `None`.
+        dual_attention_indices: Optional tuple. Specifies the indices of
+            the blocks that serve as dual attention blocks. Typically, this is
+            for 3.5 version. Defaults to `None`.
         data_format: `None` or str. If specified, either `"channels_last"` or
             `"channels_first"`. The ordering of the dimensions in the
             inputs. `"channels_last"` corresponds to inputs with shape
@@ -598,6 +976,8 @@ class MMDiT(Backbone):
         latent_shape=(64, 64, 16),
         context_shape=(None, 4096),
         pooled_projection_shape=(2048,),
+        qk_norm=None,
+        dual_attention_indices=None,
         data_format=None,
         dtype=None,
         **kwargs,
@@ -611,6 +991,7 @@ class MMDiT(Backbone):
         image_width = latent_shape[1] // patch_size
         output_dim = latent_shape[-1]
         output_dim_in_final = patch_size**2 * output_dim
+        dual_attention_indices = dual_attention_indices or ()
         data_format = standardize_data_format(data_format)
         if data_format != "channels_last":
             raise NotImplementedError(
@@ -636,12 +1017,8 @@ class MMDiT(Backbone):
             dtype=dtype,
             name="context_embedding",
         )
-        self.vector_embedding =
-
-                layers.Dense(hidden_dim, activation="silu", dtype=dtype),
-                layers.Dense(hidden_dim, activation=None, dtype=dtype),
-            ],
-            name="vector_embedding",
+        self.vector_embedding = MLP(
+            hidden_dim, hidden_dim, "silu", dtype=dtype, name="vector_embedding"
         )
         self.vector_embedding_add = layers.Add(
             dtype=dtype, name="vector_embedding_add"
@@ -655,13 +1032,18 @@ class MMDiT(Backbone):
                 hidden_dim,
                 mlp_ratio,
                 use_context_projection=not (i == num_layers - 1),
+                qk_norm=qk_norm,
+                use_dual_attention=i in dual_attention_indices,
                 dtype=dtype,
                 name=f"joint_block_{i}",
            )
             for i in range(num_layers)
         ]
-        self.
-            hidden_dim,
+        self.output_ada_layer_norm = AdaptiveLayerNormalization(
+            hidden_dim, dtype=dtype, name="output_ada_layer_norm"
+        )
+        self.output_dense = layers.Dense(
+            output_dim_in_final, dtype=dtype, name="output_dense"
         )
         self.unpatch = Unpatch(
             patch_size, output_dim, dtype=dtype, name="unpatch"
@@ -696,7 +1078,8 @@ class MMDiT(Backbone):
             x = block(x, context, timestep_embedding)
 
         # Output layer.
-        x = self.
+        x = self.output_ada_layer_norm(x, timestep_embedding)
+        x = self.output_dense(x)
         outputs = self.unpatch(x, height=image_height, width=image_width)
 
         super().__init__(
@@ -720,6 +1103,8 @@ class MMDiT(Backbone):
         self.latent_shape = latent_shape
         self.context_shape = context_shape
         self.pooled_projection_shape = pooled_projection_shape
+        self.qk_norm = qk_norm
+        self.dual_attention_indices = dual_attention_indices
 
     def get_config(self):
         config = super().get_config()
@@ -734,6 +1119,8 @@ class MMDiT(Backbone):
                 "latent_shape": self.latent_shape,
                 "context_shape": self.context_shape,
                 "pooled_projection_shape": self.pooled_projection_shape,
+                "qk_norm": self.qk_norm,
+                "dual_attention_indices": self.dual_attention_indices,
             }
         )
         return config