keras-hub-nightly 0.16.1.dev202410020340__py3-none-any.whl → 0.19.0.dev202501260345__py3-none-any.whl

This diff compares the contents of two publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. keras_hub/api/layers/__init__.py +21 -3
  2. keras_hub/api/models/__init__.py +71 -12
  3. keras_hub/api/tokenizers/__init__.py +1 -1
  4. keras_hub/src/bounding_box/__init__.py +2 -0
  5. keras_hub/src/bounding_box/converters.py +102 -12
  6. keras_hub/src/layers/modeling/f_net_encoder.py +1 -1
  7. keras_hub/src/layers/modeling/masked_lm_head.py +2 -1
  8. keras_hub/src/layers/modeling/reversible_embedding.py +3 -16
  9. keras_hub/src/layers/modeling/rms_normalization.py +36 -0
  10. keras_hub/src/layers/modeling/rotary_embedding.py +3 -2
  11. keras_hub/src/layers/modeling/token_and_position_embedding.py +1 -1
  12. keras_hub/src/layers/modeling/transformer_decoder.py +8 -6
  13. keras_hub/src/layers/modeling/transformer_encoder.py +29 -7
  14. keras_hub/src/layers/preprocessing/audio_converter.py +3 -7
  15. keras_hub/src/layers/preprocessing/image_converter.py +170 -34
  16. keras_hub/src/metrics/bleu.py +4 -3
  17. keras_hub/src/models/albert/albert_presets.py +4 -12
  18. keras_hub/src/models/albert/albert_text_classifier.py +7 -7
  19. keras_hub/src/models/backbone.py +3 -14
  20. keras_hub/src/models/bart/bart_backbone.py +4 -4
  21. keras_hub/src/models/bart/bart_presets.py +3 -9
  22. keras_hub/src/models/bart/bart_seq_2_seq_lm.py +9 -8
  23. keras_hub/src/models/basnet/__init__.py +5 -0
  24. keras_hub/src/models/basnet/basnet.py +122 -0
  25. keras_hub/src/models/basnet/basnet_backbone.py +366 -0
  26. keras_hub/src/models/basnet/basnet_image_converter.py +8 -0
  27. keras_hub/src/models/basnet/basnet_preprocessor.py +14 -0
  28. keras_hub/src/models/basnet/basnet_presets.py +17 -0
  29. keras_hub/src/models/bert/bert_presets.py +14 -32
  30. keras_hub/src/models/bert/bert_text_classifier.py +3 -3
  31. keras_hub/src/models/bloom/bloom_presets.py +8 -24
  32. keras_hub/src/models/causal_lm.py +56 -12
  33. keras_hub/src/models/clip/__init__.py +5 -0
  34. keras_hub/src/models/clip/clip_backbone.py +286 -0
  35. keras_hub/src/models/clip/clip_encoder_block.py +19 -4
  36. keras_hub/src/models/clip/clip_image_converter.py +8 -0
  37. keras_hub/src/models/clip/clip_presets.py +93 -0
  38. keras_hub/src/models/clip/clip_text_encoder.py +4 -1
  39. keras_hub/src/models/clip/clip_tokenizer.py +18 -3
  40. keras_hub/src/models/clip/clip_vision_embedding.py +101 -0
  41. keras_hub/src/models/clip/clip_vision_encoder.py +159 -0
  42. keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +2 -1
  43. keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +0 -109
  44. keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -1
  45. keras_hub/src/models/deberta_v3/deberta_v3_presets.py +5 -15
  46. keras_hub/src/models/deberta_v3/deberta_v3_text_classifier.py +4 -4
  47. keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +4 -4
  48. keras_hub/src/models/deberta_v3/disentangled_self_attention.py +3 -2
  49. keras_hub/src/models/deberta_v3/relative_embedding.py +1 -1
  50. keras_hub/src/models/deeplab_v3/__init__.py +7 -0
  51. keras_hub/src/models/deeplab_v3/deeplab_v3_backbone.py +200 -0
  52. keras_hub/src/models/deeplab_v3/deeplab_v3_image_converter.py +10 -0
  53. keras_hub/src/models/deeplab_v3/deeplab_v3_image_segmeter_preprocessor.py +16 -0
  54. keras_hub/src/models/deeplab_v3/deeplab_v3_layers.py +215 -0
  55. keras_hub/src/models/deeplab_v3/deeplab_v3_presets.py +17 -0
  56. keras_hub/src/models/deeplab_v3/deeplab_v3_segmenter.py +111 -0
  57. keras_hub/src/models/densenet/densenet_backbone.py +6 -4
  58. keras_hub/src/models/densenet/densenet_image_classifier.py +1 -129
  59. keras_hub/src/models/densenet/densenet_image_converter.py +2 -4
  60. keras_hub/src/models/densenet/densenet_presets.py +9 -15
  61. keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +1 -1
  62. keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +2 -2
  63. keras_hub/src/models/distil_bert/distil_bert_presets.py +5 -10
  64. keras_hub/src/models/distil_bert/distil_bert_text_classifier.py +5 -5
  65. keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +3 -3
  66. keras_hub/src/models/efficientnet/__init__.py +9 -0
  67. keras_hub/src/models/efficientnet/cba.py +141 -0
  68. keras_hub/src/models/efficientnet/efficientnet_backbone.py +160 -61
  69. keras_hub/src/models/efficientnet/efficientnet_image_classifier.py +14 -0
  70. keras_hub/src/models/efficientnet/efficientnet_image_classifier_preprocessor.py +16 -0
  71. keras_hub/src/models/efficientnet/efficientnet_image_converter.py +10 -0
  72. keras_hub/src/models/efficientnet/efficientnet_presets.py +193 -0
  73. keras_hub/src/models/efficientnet/fusedmbconv.py +84 -41
  74. keras_hub/src/models/efficientnet/mbconv.py +53 -22
  75. keras_hub/src/models/electra/electra_backbone.py +2 -2
  76. keras_hub/src/models/electra/electra_presets.py +6 -18
  77. keras_hub/src/models/f_net/f_net_presets.py +2 -6
  78. keras_hub/src/models/f_net/f_net_text_classifier.py +3 -3
  79. keras_hub/src/models/f_net/f_net_text_classifier_preprocessor.py +3 -3
  80. keras_hub/src/models/falcon/falcon_backbone.py +5 -3
  81. keras_hub/src/models/falcon/falcon_causal_lm.py +18 -8
  82. keras_hub/src/models/falcon/falcon_presets.py +1 -3
  83. keras_hub/src/models/falcon/falcon_tokenizer.py +7 -2
  84. keras_hub/src/models/feature_pyramid_backbone.py +1 -1
  85. keras_hub/src/models/flux/__init__.py +5 -0
  86. keras_hub/src/models/flux/flux_layers.py +496 -0
  87. keras_hub/src/models/flux/flux_maths.py +225 -0
  88. keras_hub/src/models/flux/flux_model.py +236 -0
  89. keras_hub/src/models/flux/flux_presets.py +3 -0
  90. keras_hub/src/models/flux/flux_text_to_image.py +146 -0
  91. keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +73 -0
  92. keras_hub/src/models/gemma/gemma_backbone.py +35 -20
  93. keras_hub/src/models/gemma/gemma_causal_lm.py +2 -2
  94. keras_hub/src/models/gemma/gemma_decoder_block.py +3 -1
  95. keras_hub/src/models/gemma/gemma_presets.py +29 -63
  96. keras_hub/src/models/gpt2/gpt2_causal_lm.py +2 -2
  97. keras_hub/src/models/gpt2/gpt2_presets.py +5 -14
  98. keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +2 -1
  99. keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +3 -3
  100. keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +2 -1
  101. keras_hub/src/models/image_classifier.py +147 -2
  102. keras_hub/src/models/image_classifier_preprocessor.py +6 -3
  103. keras_hub/src/models/image_object_detector.py +87 -0
  104. keras_hub/src/models/image_object_detector_preprocessor.py +57 -0
  105. keras_hub/src/models/image_segmenter.py +0 -5
  106. keras_hub/src/models/image_segmenter_preprocessor.py +29 -4
  107. keras_hub/src/models/image_to_image.py +417 -0
  108. keras_hub/src/models/inpaint.py +520 -0
  109. keras_hub/src/models/llama/llama_backbone.py +138 -12
  110. keras_hub/src/models/llama/llama_causal_lm.py +3 -1
  111. keras_hub/src/models/llama/llama_presets.py +10 -20
  112. keras_hub/src/models/llama3/llama3_backbone.py +12 -11
  113. keras_hub/src/models/llama3/llama3_causal_lm.py +1 -1
  114. keras_hub/src/models/llama3/llama3_presets.py +4 -12
  115. keras_hub/src/models/llama3/llama3_tokenizer.py +25 -2
  116. keras_hub/src/models/mistral/mistral_backbone.py +16 -15
  117. keras_hub/src/models/mistral/mistral_causal_lm.py +6 -4
  118. keras_hub/src/models/mistral/mistral_presets.py +3 -9
  119. keras_hub/src/models/mistral/mistral_transformer_decoder.py +2 -1
  120. keras_hub/src/models/mit/__init__.py +6 -0
  121. keras_hub/src/models/{mix_transformer/mix_transformer_backbone.py → mit/mit_backbone.py} +47 -36
  122. keras_hub/src/models/mit/mit_image_classifier.py +12 -0
  123. keras_hub/src/models/mit/mit_image_classifier_preprocessor.py +12 -0
  124. keras_hub/src/models/mit/mit_image_converter.py +8 -0
  125. keras_hub/src/models/{mix_transformer/mix_transformer_layers.py → mit/mit_layers.py} +20 -13
  126. keras_hub/src/models/mit/mit_presets.py +139 -0
  127. keras_hub/src/models/mobilenet/mobilenet_backbone.py +8 -8
  128. keras_hub/src/models/mobilenet/mobilenet_image_classifier.py +0 -92
  129. keras_hub/src/models/opt/opt_causal_lm.py +2 -2
  130. keras_hub/src/models/opt/opt_presets.py +4 -12
  131. keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +63 -17
  132. keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +3 -1
  133. keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +21 -23
  134. keras_hub/src/models/pali_gemma/pali_gemma_image_converter.py +2 -4
  135. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +173 -17
  136. keras_hub/src/models/pali_gemma/pali_gemma_vit.py +14 -26
  137. keras_hub/src/models/phi3/phi3_causal_lm.py +3 -1
  138. keras_hub/src/models/phi3/phi3_decoder.py +0 -1
  139. keras_hub/src/models/phi3/phi3_presets.py +2 -6
  140. keras_hub/src/models/phi3/phi3_rotary_embedding.py +1 -1
  141. keras_hub/src/models/preprocessor.py +25 -11
  142. keras_hub/src/models/resnet/resnet_backbone.py +3 -14
  143. keras_hub/src/models/resnet/resnet_image_classifier.py +0 -137
  144. keras_hub/src/models/resnet/resnet_image_converter.py +2 -4
  145. keras_hub/src/models/resnet/resnet_presets.py +127 -18
  146. keras_hub/src/models/retinanet/__init__.py +5 -0
  147. keras_hub/src/models/retinanet/anchor_generator.py +52 -53
  148. keras_hub/src/models/retinanet/feature_pyramid.py +103 -39
  149. keras_hub/src/models/retinanet/non_max_supression.py +1 -0
  150. keras_hub/src/models/retinanet/prediction_head.py +192 -0
  151. keras_hub/src/models/retinanet/retinanet_backbone.py +146 -0
  152. keras_hub/src/models/retinanet/retinanet_image_converter.py +53 -0
  153. keras_hub/src/models/retinanet/retinanet_label_encoder.py +49 -51
  154. keras_hub/src/models/retinanet/retinanet_object_detector.py +381 -0
  155. keras_hub/src/models/retinanet/retinanet_object_detector_preprocessor.py +14 -0
  156. keras_hub/src/models/retinanet/retinanet_presets.py +16 -0
  157. keras_hub/src/models/roberta/roberta_backbone.py +2 -2
  158. keras_hub/src/models/roberta/roberta_presets.py +6 -8
  159. keras_hub/src/models/roberta/roberta_text_classifier.py +3 -3
  160. keras_hub/src/models/sam/__init__.py +5 -0
  161. keras_hub/src/models/sam/sam_backbone.py +2 -3
  162. keras_hub/src/models/sam/sam_image_converter.py +2 -4
  163. keras_hub/src/models/sam/sam_image_segmenter.py +16 -16
  164. keras_hub/src/models/sam/sam_image_segmenter_preprocessor.py +11 -1
  165. keras_hub/src/models/sam/sam_layers.py +5 -3
  166. keras_hub/src/models/sam/sam_presets.py +3 -9
  167. keras_hub/src/models/sam/sam_prompt_encoder.py +4 -2
  168. keras_hub/src/models/sam/sam_transformer.py +5 -4
  169. keras_hub/src/models/segformer/__init__.py +8 -0
  170. keras_hub/src/models/segformer/segformer_backbone.py +167 -0
  171. keras_hub/src/models/segformer/segformer_image_converter.py +8 -0
  172. keras_hub/src/models/segformer/segformer_image_segmenter.py +184 -0
  173. keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py +31 -0
  174. keras_hub/src/models/segformer/segformer_presets.py +136 -0
  175. keras_hub/src/models/seq_2_seq_lm_preprocessor.py +1 -1
  176. keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +8 -1
  177. keras_hub/src/models/stable_diffusion_3/mmdit.py +577 -190
  178. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +189 -163
  179. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +178 -0
  180. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +193 -0
  181. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py +43 -7
  182. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +25 -14
  183. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +1 -1
  184. keras_hub/src/models/t5/t5_backbone.py +5 -4
  185. keras_hub/src/models/t5/t5_presets.py +47 -19
  186. keras_hub/src/models/task.py +47 -39
  187. keras_hub/src/models/text_classifier.py +2 -2
  188. keras_hub/src/models/text_to_image.py +106 -41
  189. keras_hub/src/models/vae/__init__.py +1 -0
  190. keras_hub/src/models/vae/vae_backbone.py +184 -0
  191. keras_hub/src/models/vae/vae_layers.py +739 -0
  192. keras_hub/src/models/vgg/__init__.py +5 -0
  193. keras_hub/src/models/vgg/vgg_backbone.py +4 -24
  194. keras_hub/src/models/vgg/vgg_image_classifier.py +139 -33
  195. keras_hub/src/models/vgg/vgg_image_classifier_preprocessor.py +12 -0
  196. keras_hub/src/models/vgg/vgg_image_converter.py +8 -0
  197. keras_hub/src/models/vgg/vgg_presets.py +48 -0
  198. keras_hub/src/models/vit/__init__.py +5 -0
  199. keras_hub/src/models/vit/vit_backbone.py +152 -0
  200. keras_hub/src/models/vit/vit_image_classifier.py +187 -0
  201. keras_hub/src/models/vit/vit_image_classifier_preprocessor.py +12 -0
  202. keras_hub/src/models/vit/vit_image_converter.py +73 -0
  203. keras_hub/src/models/vit/vit_layers.py +391 -0
  204. keras_hub/src/models/vit/vit_presets.py +126 -0
  205. keras_hub/src/models/vit_det/vit_det_backbone.py +6 -4
  206. keras_hub/src/models/vit_det/vit_layers.py +3 -3
  207. keras_hub/src/models/whisper/whisper_audio_converter.py +2 -4
  208. keras_hub/src/models/whisper/whisper_backbone.py +6 -5
  209. keras_hub/src/models/whisper/whisper_decoder.py +3 -5
  210. keras_hub/src/models/whisper/whisper_presets.py +10 -30
  211. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +1 -1
  212. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +2 -2
  213. keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +2 -6
  214. keras_hub/src/models/xlm_roberta/xlm_roberta_text_classifier.py +4 -4
  215. keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +2 -1
  216. keras_hub/src/models/xlnet/relative_attention.py +20 -19
  217. keras_hub/src/models/xlnet/xlnet_backbone.py +2 -2
  218. keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +3 -5
  219. keras_hub/src/models/xlnet/xlnet_encoder.py +7 -9
  220. keras_hub/src/samplers/contrastive_sampler.py +2 -3
  221. keras_hub/src/samplers/sampler.py +2 -1
  222. keras_hub/src/tests/test_case.py +41 -6
  223. keras_hub/src/tokenizers/byte_pair_tokenizer.py +7 -3
  224. keras_hub/src/tokenizers/byte_tokenizer.py +3 -10
  225. keras_hub/src/tokenizers/sentence_piece_tokenizer.py +2 -9
  226. keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +9 -11
  227. keras_hub/src/tokenizers/tokenizer.py +10 -13
  228. keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +9 -7
  229. keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +10 -3
  230. keras_hub/src/utils/keras_utils.py +2 -13
  231. keras_hub/src/utils/pipeline_model.py +3 -3
  232. keras_hub/src/utils/preset_utils.py +196 -144
  233. keras_hub/src/utils/tensor_utils.py +4 -4
  234. keras_hub/src/utils/timm/convert_densenet.py +6 -4
  235. keras_hub/src/utils/timm/convert_efficientnet.py +447 -0
  236. keras_hub/src/utils/timm/convert_resnet.py +1 -1
  237. keras_hub/src/utils/timm/convert_vgg.py +85 -0
  238. keras_hub/src/utils/timm/preset_loader.py +14 -9
  239. keras_hub/src/utils/transformers/convert_llama3.py +21 -5
  240. keras_hub/src/utils/transformers/convert_vit.py +150 -0
  241. keras_hub/src/utils/transformers/preset_loader.py +23 -0
  242. keras_hub/src/utils/transformers/safetensor_utils.py +4 -3
  243. keras_hub/src/version_utils.py +1 -1
  244. {keras_hub_nightly-0.16.1.dev202410020340.dist-info → keras_hub_nightly-0.19.0.dev202501260345.dist-info}/METADATA +86 -68
  245. keras_hub_nightly-0.19.0.dev202501260345.dist-info/RECORD +423 -0
  246. {keras_hub_nightly-0.16.1.dev202410020340.dist-info → keras_hub_nightly-0.19.0.dev202501260345.dist-info}/WHEEL +1 -1
  247. keras_hub/src/layers/preprocessing/resizing_image_converter.py +0 -138
  248. keras_hub/src/models/mix_transformer/__init__.py +0 -0
  249. keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +0 -119
  250. keras_hub/src/models/stable_diffusion_3/vae_image_decoder.py +0 -320
  251. keras_hub_nightly-0.16.1.dev202410020340.dist-info/RECORD +0 -357
  252. {keras_hub_nightly-0.16.1.dev202410020340.dist-info → keras_hub_nightly-0.19.0.dev202501260345.dist-info}/top_level.txt +0 -0
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py

@@ -4,13 +4,10 @@ from keras import ops

 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.stable_diffusion_3.flow_match_euler_discrete_scheduler import (
+from keras_hub.src.models.stable_diffusion_3.flow_match_euler_discrete_scheduler import (  # noqa: E501
     FlowMatchEulerDiscreteScheduler,
 )
 from keras_hub.src.models.stable_diffusion_3.mmdit import MMDiT
-from keras_hub.src.models.stable_diffusion_3.vae_image_decoder import (
-    VAEImageDecoder,
-)
 from keras_hub.src.utils.keras_utils import standardize_data_format

@@ -54,11 +51,52 @@ class CLIPProjection(layers.Layer):
         return (inputs_shape[0], self.hidden_dim)


-class ClassifierFreeGuidanceConcatenate(layers.Layer):
-    def __init__(self, axis=0, **kwargs):
-        super().__init__(**kwargs)
-        self.axis = axis
+class CLIPConcatenate(layers.Layer):
+    def call(
+        self,
+        clip_l_projection,
+        clip_g_projection,
+        clip_l_intermediate_output,
+        clip_g_intermediate_output,
+        padding,
+    ):
+        pooled_embeddings = ops.concatenate(
+            [clip_l_projection, clip_g_projection], axis=-1
+        )
+        embeddings = ops.concatenate(
+            [clip_l_intermediate_output, clip_g_intermediate_output], axis=-1
+        )
+        embeddings = ops.pad(embeddings, [[0, 0], [0, 0], [0, padding]])
+        return pooled_embeddings, embeddings
+
+
+class ImageRescaling(layers.Rescaling):
+    """Rescales inputs from image space to latent space.
+
+    The rescaling is performed using the formula: `(inputs - offset) * scale`.
+    """

+    def call(self, inputs):
+        dtype = self.compute_dtype
+        scale = self.backend.cast(self.scale, dtype)
+        offset = self.backend.cast(self.offset, dtype)
+        return (self.backend.cast(inputs, dtype) - offset) * scale
+
+
+class LatentRescaling(layers.Rescaling):
+    """Rescales inputs from latent space to image space.
+
+    The rescaling is performed using the formula: `inputs / scale + offset`.
+    """
+
+    def call(self, inputs):
+        dtype = self.compute_dtype
+        scale = self.backend.cast(self.scale, dtype)
+        offset = self.backend.cast(self.offset, dtype)
+        return (self.backend.cast(inputs, dtype) / scale) + offset
+
+
+class ClassifierFreeGuidanceConcatenate(layers.Layer):
     def call(
         self,
         latents,
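Note: the two rescaling layers added above are exact inverses of one another. `ImageRescaling` maps inputs from image space to latent space via `(inputs - offset) * scale`, and `LatentRescaling` maps latents back via `inputs / scale + offset`; the backbone wires both to the VAE's `scale` and `shift`. A minimal standalone sketch of the arithmetic (the numeric values below are placeholders, not taken from this diff):

    # Hypothetical scale/shift values, for illustration only.
    scale, shift = 1.5305, 0.0609

    def to_latent_space(x):
        # What ImageRescaling.call computes.
        return (x - shift) * scale

    def to_image_space(z):
        # What LatentRescaling.call computes.
        return z / scale + shift

    # Round-tripping recovers the original value.
    assert abs(to_image_space(to_latent_space(0.25)) - 0.25) < 1e-9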
@@ -69,20 +107,16 @@ class ClassifierFreeGuidanceConcatenate(layers.Layer):
         timestep,
     ):
         timestep = ops.broadcast_to(timestep, ops.shape(latents)[:1])
-        latents = ops.concatenate([latents, latents], axis=self.axis)
+        latents = ops.concatenate([latents, latents], axis=0)
         contexts = ops.concatenate(
-            [positive_contexts, negative_contexts], axis=self.axis
+            [positive_contexts, negative_contexts], axis=0
         )
         pooled_projections = ops.concatenate(
-            [positive_pooled_projections, negative_pooled_projections],
-            axis=self.axis,
+            [positive_pooled_projections, negative_pooled_projections], axis=0
         )
-        timesteps = ops.concatenate([timestep, timestep], axis=self.axis)
+        timesteps = ops.concatenate([timestep, timestep], axis=0)
         return latents, contexts, pooled_projections, timesteps

-    def get_config(self):
-        return super().get_config()
-

 class ClassifierFreeGuidance(layers.Layer):
     """Perform classifier free guidance.
@@ -103,9 +137,6 @@ class ClassifierFreeGuidance(layers.Layer):
     - [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
     """

-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
     def call(self, inputs, guidance_scale):
         positive_noise, negative_noise = ops.split(inputs, 2, axis=0)
         return ops.add(
@@ -115,9 +146,6 @@ class ClassifierFreeGuidance(layers.Layer):
             ),
         )

-    def get_config(self):
-        return super().get_config()
-
     def compute_output_shape(self, inputs_shape):
         outputs_shape = list(inputs_shape)
         if outputs_shape[0] is not None:
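Note: the `ops.add(...)` call completed in the hunk above combines the two noise predictions split out of the batch. Its body is not fully visible in these hunks, but the combination is the standard classifier-free guidance formula, `negative + guidance_scale * (positive - negative)`. A standalone NumPy sketch (shapes and values are illustrative only):

    import numpy as np

    def classifier_free_guidance(stacked_noise, guidance_scale):
        # The two predictions arrive concatenated along the batch axis,
        # matching ops.split(inputs, 2, axis=0) in the layer above.
        positive, negative = np.split(stacked_noise, 2, axis=0)
        return negative + guidance_scale * (positive - negative)

    stacked = np.concatenate(
        [np.full((1, 4), 0.8), np.full((1, 4), 0.2)], axis=0
    )
    print(classifier_free_guidance(stacked, guidance_scale=7.0))  # 0.2 + 7 * 0.6 = 4.4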
@@ -145,58 +173,10 @@ class EulerStep(layers.Layer):
     https://arxiv.org/abs/2206.00364).
     """

-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
     def call(self, latents, noise_residual, sigma, sigma_next):
         sigma_diff = ops.subtract(sigma_next, sigma)
         return ops.add(latents, ops.multiply(sigma_diff, noise_residual))

-    def get_config(self):
-        return super().get_config()
-
-    def compute_output_shape(self, latents_shape):
-        return latents_shape
-
-
-class LatentSpaceDecoder(layers.Layer):
-    """Decoder to transform the latent space back to the original image space.
-
-    During decoding, the latents are transformed back to the original image
-    space using the equation: `latents / scale + shift`.
-
-    Args:
-        scale: float. The scaling factor.
-        shift: float. The shift factor.
-        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
-            including `name`, `dtype` etc.
-
-    Call arguments:
-        latents: The latent tensor to be transformed.
-
-    Reference:
-    - [High-Resolution Image Synthesis with Latent Diffusion Models](
-    https://arxiv.org/abs/2112.10752).
-    """
-
-    def __init__(self, scale, shift, **kwargs):
-        super().__init__(**kwargs)
-        self.scale = scale
-        self.shift = shift
-
-    def call(self, latents):
-        return ops.add(ops.divide(latents, self.scale), self.shift)
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "scale": self.scale,
-                "shift": self.shift,
-            }
-        )
-        return config
-
     def compute_output_shape(self, latents_shape):
         return latents_shape

@@ -222,16 +202,18 @@ class StableDiffusion3Backbone(Backbone):
             transformer in MMDiT.
         mmdit_position_size: int. The size of the height and width for the
             position embedding in MMDiT.
-        vae_stackwise_num_filters: list of ints. The number of filters for each
-            stack in VAE.
-        vae_stackwise_num_blocks: list of ints. The number of blocks for each
-            stack in VAE.
-        clip_l: `keras_hub.models.CLIPTextEncoder`. The text encoder for
-            encoding the inputs.
-        clip_g: `keras_hub.models.CLIPTextEncoder`. The text encoder for
-            encoding the inputs.
-        t5: optional `keras_hub.models.T5Encoder`. The text encoder for
-            encoding the inputs.
+        mmdit_qk_norm: Optional str. Whether to normalize the query and key
+            tensors for each transformer in MMDiT. Available options are `None`
+            and `"rms_norm"`. Typically, this is set to `None` for 3.0 version
+            and to `"rms_norm"` for 3.5 version.
+        mmdit_dual_attention_indices: Optional tuple. Specifies the indices of
+            the blocks that serve as dual attention blocks. Typically, this is
+            for 3.5 version. Defaults to `None`.
+        vae: The VAE used for transformations between pixel space and latent
+            space.
+        clip_l: The CLIP text encoder for encoding the inputs.
+        clip_g: The CLIP text encoder for encoding the inputs.
+        t5: optional The T5 text encoder for encoding the inputs.
         latent_channels: int. The number of channels in the latent. Defaults to
             `16`.
         output_channels: int. The number of channels in the output. Defaults to
@@ -239,9 +221,9 @@ class StableDiffusion3Backbone(Backbone):
         num_train_timesteps: int. The number of diffusion steps to train the
             model. Defaults to `1000`.
         shift: float. The shift value for the timestep schedule. Defaults to
-            `1.0`.
-        height: optional int. The output height of the image.
-        width: optional int. The output width of the image.
+            `3.0`.
+        image_shape: tuple. The input shape without the batch size. Defaults to
+            `(1024, 1024, 3)`.
         data_format: `None` or str. If specified, either `"channels_last"` or
             `"channels_first"`. The ordering of the dimensions in the
             inputs. `"channels_last"` corresponds to inputs with shape
@@ -264,6 +246,7 @@ class StableDiffusion3Backbone(Backbone):
     )

     # Randomly initialized Stable Diffusion 3 model with custom config.
+    vae = keras_hub.models.VAEBackbone(...)
     clip_l = keras_hub.models.CLIPTextEncoder(...)
     clip_g = keras_hub.models.CLIPTextEncoder(...)
     model = keras_hub.models.StableDiffusion3Backbone(
@@ -272,8 +255,9 @@ class StableDiffusion3Backbone(Backbone):
         mmdit_hidden_dim=256,
         mmdit_depth=4,
         mmdit_position_size=192,
-        vae_stackwise_num_filters=[128, 128, 64, 32],
-        vae_stackwise_num_blocks=[1, 1, 1, 1],
+        mmdit_qk_norm=None,
+        mmdit_dual_attention_indices=None,
+        vae=vae,
         clip_l=clip_l,
         clip_g=clip_g,
     )
@@ -287,46 +271,48 @@ class StableDiffusion3Backbone(Backbone):
         mmdit_num_layers,
         mmdit_num_heads,
         mmdit_position_size,
-        vae_stackwise_num_filters,
-        vae_stackwise_num_blocks,
+        mmdit_qk_norm,
+        mmdit_dual_attention_indices,
+        vae,
         clip_l,
         clip_g,
         t5=None,
         latent_channels=16,
         output_channels=3,
         num_train_timesteps=1000,
-        shift=1.0,
-        height=None,
-        width=None,
+        shift=3.0,
+        image_shape=(1024, 1024, 3),
         data_format=None,
         dtype=None,
         **kwargs,
     ):
-        height = int(height or 1024)
-        width = int(width or 1024)
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(
-                "`height` and `width` must be divisible by 8. "
-                f"Received: height={height}, width={width}"
-            )
         data_format = standardize_data_format(data_format)
         if data_format != "channels_last":
             raise NotImplementedError
-        latent_shape = (height // 8, width // 8, latent_channels)
+        height = image_shape[0]
+        width = image_shape[1]
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                "height and width in `image_shape` must be divisible by 8. "
+                f"Received: image_shape={image_shape}"
+            )
+        latent_shape = (height // 8, width // 8, int(latent_channels))
         context_shape = (None, 4096 if t5 is None else t5.hidden_dim)
         pooled_projection_shape = (clip_l.hidden_dim + clip_g.hidden_dim,)
+        self._latent_shape = latent_shape

         # === Layers ===
         self.clip_l = clip_l
         self.clip_l_projection = CLIPProjection(
             clip_l.hidden_dim, dtype=dtype, name="clip_l_projection"
         )
-        self.clip_l_projection.build([None, clip_l.hidden_dim], None)
         self.clip_g = clip_g
         self.clip_g_projection = CLIPProjection(
             clip_g.hidden_dim, dtype=dtype, name="clip_g_projection"
         )
-        self.clip_g_projection.build([None, clip_g.hidden_dim], None)
+        self.clip_concatenate = CLIPConcatenate(
+            dtype=dtype, name="clip_concatenate"
+        )
         self.t5 = t5
         self.diffuser = MMDiT(
             mmdit_patch_size,
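Note: with the constructor change above, the latent grid is derived from a single `image_shape` tuple instead of separate `height`/`width` arguments; each spatial dimension must be divisible by 8 and is downsampled by that factor. A quick check of the arithmetic (values illustrative):

    image_shape = (1024, 1024, 3)
    latent_channels = 16

    height, width = image_shape[0], image_shape[1]
    assert height % 8 == 0 and width % 8 == 0  # enforced by the constructor
    latent_shape = (height // 8, width // 8, int(latent_channels))
    print(latent_shape)  # (128, 128, 16)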
@@ -337,18 +323,18 @@ class StableDiffusion3Backbone(Backbone):
             latent_shape=latent_shape,
             context_shape=context_shape,
             pooled_projection_shape=pooled_projection_shape,
+            qk_norm=mmdit_qk_norm,
+            dual_attention_indices=mmdit_dual_attention_indices,
             data_format=data_format,
             dtype=dtype,
             name="diffuser",
         )
-        self.decoder = VAEImageDecoder(
-            vae_stackwise_num_filters,
-            vae_stackwise_num_blocks,
-            output_channels,
-            latent_shape=latent_shape,
-            data_format=data_format,
-            dtype=dtype,
-            name="decoder",
+        self.vae = vae
+        self.cfg_concat = ClassifierFreeGuidanceConcatenate(
+            dtype=dtype, name="classifier_free_guidance_concat"
+        )
+        self.cfg = ClassifierFreeGuidance(
+            dtype=dtype, name="classifier_free_guidance"
         )
         # Set `dtype="float32"` to ensure the high precision for the noise
         # residual.
@@ -358,21 +344,25 @@ class StableDiffusion3Backbone(Backbone):
             dtype="float32",
             name="scheduler",
         )
-        self.cfg_concat = ClassifierFreeGuidanceConcatenate(
-            dtype="float32", name="classifier_free_guidance_concat"
-        )
-        self.cfg = ClassifierFreeGuidance(
-            dtype="float32", name="classifier_free_guidance"
-        )
         self.euler_step = EulerStep(dtype="float32", name="euler_step")
-        self.latent_space_decoder = LatentSpaceDecoder(
-            scale=self.decoder.scaling_factor,
-            shift=self.decoder.shift_factor,
-            dtype="float32",
-            name="latent_space_decoder",
+        self.image_rescaling = ImageRescaling(
+            scale=self.vae.scale,
+            offset=self.vae.shift,
+            dtype=dtype,
+            name="image_rescaling",
+        )
+        self.latent_rescaling = LatentRescaling(
+            scale=self.vae.scale,
+            offset=self.vae.shift,
+            dtype=dtype,
+            name="latent_rescaling",
         )

         # === Functional Model ===
+        image_input = keras.Input(
+            shape=image_shape,
+            name="images",
+        )
         latent_input = keras.Input(
             shape=latent_shape,
             name="latents",
@@ -428,17 +418,19 @@ class StableDiffusion3Backbone(Backbone):
             dtype="float32",
             name="guidance_scale",
         )
-        embeddings = self.encode_step(token_ids, negative_token_ids)
+        embeddings = self.encode_text_step(token_ids, negative_token_ids)
+        latents = self.encode_image_step(image_input)
         # Use `steps=0` to define the functional model.
-        latents = self.denoise_step(
+        denoised_latents = self.denoise_step(
             latent_input,
             embeddings,
             0,
             num_step_input[0],
             guidance_scale_input[0],
         )
-        outputs = self.decode_step(latents)
+        images = self.decode_step(denoised_latents)
         inputs = {
+            "images": image_input,
             "latents": latent_input,
             "clip_l_token_ids": clip_l_token_id_input,
             "clip_l_negative_token_ids": clip_l_negative_token_id_input,
@@ -447,6 +439,10 @@ class StableDiffusion3Backbone(Backbone):
             "num_steps": num_step_input,
             "guidance_scale": guidance_scale_input,
         }
+        outputs = {
+            "latents": latents,
+            "images": images,
+        }
         if self.t5 is not None:
             inputs["t5_token_ids"] = t5_token_id_input
             inputs["t5_negative_token_ids"] = t5_negative_token_id_input
@@ -463,18 +459,17 @@ class StableDiffusion3Backbone(Backbone):
         self.mmdit_num_layers = mmdit_num_layers
         self.mmdit_num_heads = mmdit_num_heads
         self.mmdit_position_size = mmdit_position_size
-        self.vae_stackwise_num_filters = vae_stackwise_num_filters
-        self.vae_stackwise_num_blocks = vae_stackwise_num_blocks
+        self.mmdit_qk_norm = mmdit_qk_norm
+        self.mmdit_dual_attention_indices = mmdit_dual_attention_indices
         self.latent_channels = latent_channels
         self.output_channels = output_channels
         self.num_train_timesteps = num_train_timesteps
         self.shift = shift
-        self.height = height
-        self.width = width
+        self.image_shape = image_shape

     @property
     def latent_shape(self):
-        return (None,) + tuple(self.diffuser.latent_shape)
+        return (None,) + self._latent_shape

     @property
     def clip_hidden_dim(self):
@@ -484,13 +479,17 @@ class StableDiffusion3Backbone(Backbone):
     def t5_hidden_dim(self):
         return 4096 if self.t5 is None else self.t5.hidden_dim

-    def encode_step(self, token_ids, negative_token_ids):
+    def encode_text_step(self, token_ids, negative_token_ids):
         clip_hidden_dim = self.clip_hidden_dim
         t5_hidden_dim = self.t5_hidden_dim

         def encode(token_ids):
-            clip_l_outputs = self.clip_l(token_ids["clip_l"], training=False)
-            clip_g_outputs = self.clip_g(token_ids["clip_g"], training=False)
+            clip_l_outputs = self.clip_l(
+                {"token_ids": token_ids["clip_l"]}, training=False
+            )
+            clip_g_outputs = self.clip_g(
+                {"token_ids": token_ids["clip_g"]}, training=False
+            )
             clip_l_projection = self.clip_l_projection(
                 clip_l_outputs["sequence_output"],
                 token_ids["clip_l"],
@@ -501,23 +500,21 @@ class StableDiffusion3Backbone(Backbone):
                 token_ids["clip_g"],
                 training=False,
             )
-            pooled_embeddings = ops.concatenate(
-                [clip_l_projection, clip_g_projection],
-                axis=-1,
-            )
-            embeddings = ops.concatenate(
-                [
-                    clip_l_outputs["intermediate_output"],
-                    clip_g_outputs["intermediate_output"],
-                ],
-                axis=-1,
-            )
-            embeddings = ops.pad(
-                embeddings,
-                [[0, 0], [0, 0], [0, t5_hidden_dim - clip_hidden_dim]],
+            pooled_embeddings, embeddings = self.clip_concatenate(
+                clip_l_projection,
+                clip_g_projection,
+                clip_l_outputs["intermediate_output"],
+                clip_g_outputs["intermediate_output"],
+                padding=t5_hidden_dim - clip_hidden_dim,
             )
             if self.t5 is not None:
-                t5_outputs = self.t5(token_ids["t5"], training=False)
+                t5_outputs = self.t5(
+                    {
+                        "token_ids": token_ids["t5"],
+                        "padding_mask": ops.ones_like(token_ids["t5"]),
+                    },
+                    training=False,
+                )
                 embeddings = ops.concatenate([embeddings, t5_outputs], axis=-2)
             else:
                 padded_size = self.clip_l.max_sequence_length
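Note: inside `encode`, the concatenated CLIP-L/CLIP-G features are zero-padded along the feature axis up to the T5 width (`padding=t5_hidden_dim - clip_hidden_dim` in the `CLIPConcatenate` call above) so that the T5 sequence can later be concatenated along the sequence axis. A shape-only sketch, with hidden sizes that are assumptions rather than values read from this diff:

    from keras import ops

    clip_hidden_dim = 768 + 1280  # clip_l.hidden_dim + clip_g.hidden_dim (assumed)
    t5_hidden_dim = 4096  # assumed

    embeddings = ops.zeros((1, 77, clip_hidden_dim))
    padding = t5_hidden_dim - clip_hidden_dim
    embeddings = ops.pad(embeddings, [[0, 0], [0, 0], [0, padding]])
    print(ops.shape(embeddings))  # (1, 77, 4096)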
@@ -537,23 +534,36 @@ class StableDiffusion3Backbone(Backbone):
             negative_pooled_embeddings,
         )

+    def encode_image_step(self, images):
+        latents = self.vae.encode(images)
+        return self.image_rescaling(latents)
+
+    def add_noise_step(self, latents, noises, step, num_steps):
+        return self.scheduler.add_noise(latents, noises, step, num_steps)
+
     def denoise_step(
         self,
         latents,
         embeddings,
-        steps,
+        step,
         num_steps,
-        guidance_scale,
+        guidance_scale=None,
     ):
-        steps = ops.convert_to_tensor(steps)
-        steps_next = ops.add(steps, 1)
-        sigma, timestep = self.scheduler(steps, num_steps)
-        sigma_next, _ = self.scheduler(steps_next, num_steps)
+        step = ops.convert_to_tensor(step)
+        next_step = ops.add(step, 1)
+        sigma, timestep = self.scheduler(step, num_steps)
+        next_sigma, _ = self.scheduler(next_step, num_steps)

         # Concatenation for classifier-free guidance.
-        concated_latents, contexts, pooled_projs, timesteps = self.cfg_concat(
-            latents, *embeddings, timestep
-        )
+        if guidance_scale is not None:
+            concated_latents, contexts, pooled_projs, timesteps = (
+                self.cfg_concat(latents, *embeddings, timestep)
+            )
+        else:
+            timesteps = ops.broadcast_to(timestep, ops.shape(latents)[:1])
+            concated_latents = latents
+            contexts = embeddings[0]
+            pooled_projs = embeddings[2]

         # Diffusion.
         predicted_noise = self.diffuser(
@@ -567,14 +577,15 @@ class StableDiffusion3Backbone(Backbone):
         )

         # Classifier-free guidance.
-        predicted_noise = self.cfg(predicted_noise, guidance_scale)
+        if guidance_scale is not None:
+            predicted_noise = self.cfg(predicted_noise, guidance_scale)

         # Euler step.
-        return self.euler_step(latents, predicted_noise, sigma, sigma_next)
+        return self.euler_step(latents, predicted_noise, sigma, next_sigma)

     def decode_step(self, latents):
-        latents = self.latent_space_decoder(latents)
-        return self.decoder(latents, training=False)
+        latents = self.latent_rescaling(latents)
+        return self.vae.decode(latents, training=False)

     def get_config(self):
         config = super().get_config()
@@ -585,8 +596,11 @@ class StableDiffusion3Backbone(Backbone):
                 "mmdit_num_layers": self.mmdit_num_layers,
                 "mmdit_num_heads": self.mmdit_num_heads,
                 "mmdit_position_size": self.mmdit_position_size,
-                "vae_stackwise_num_filters": self.vae_stackwise_num_filters,
-                "vae_stackwise_num_blocks": self.vae_stackwise_num_blocks,
+                "mmdit_qk_norm": self.mmdit_qk_norm,
+                "mmdit_dual_attention_indices": (
+                    self.mmdit_dual_attention_indices
+                ),
+                "vae": layers.serialize(self.vae),
                 "clip_l": layers.serialize(self.clip_l),
                 "clip_g": layers.serialize(self.clip_g),
                 "t5": layers.serialize(self.t5),
@@ -594,8 +608,7 @@ class StableDiffusion3Backbone(Backbone):
                 "output_channels": self.output_channels,
                 "num_train_timesteps": self.num_train_timesteps,
                 "shift": self.shift,
-                "height": self.height,
-                "width": self.width,
+                "image_shape": self.image_shape,
             }
         )
         return config
@@ -607,6 +620,8 @@ class StableDiffusion3Backbone(Backbone):
         # Propagate `dtype` to text encoders if needed.
         if "dtype" in config and config["dtype"] is not None:
             dtype_config = config["dtype"]
+            if "dtype" not in config["vae"]["config"]:
+                config["vae"]["config"]["dtype"] = dtype_config
             if "dtype" not in config["clip_l"]["config"]:
                 config["clip_l"]["config"]["dtype"] = dtype_config
             if "dtype" not in config["clip_g"]["config"]:
@@ -617,7 +632,10 @@ class StableDiffusion3Backbone(Backbone):
         ):
             config["t5"]["config"]["dtype"] = dtype_config

-        # We expect `clip_l`, `clip_g` and/or `t5` to be instantiated.
+        # We expect `vae`, `clip_l`, `clip_g` and/or `t5` to be instantiated.
+        config["vae"] = layers.deserialize(
+            config["vae"], custom_objects=custom_objects
+        )
         config["clip_l"] = layers.deserialize(
             config["clip_l"], custom_objects=custom_objects
         )
@@ -628,4 +646,12 @@ class StableDiffusion3Backbone(Backbone):
         config["t5"] = layers.deserialize(
             config["t5"], custom_objects=custom_objects
         )
+
+        # To maintain backward compatibility, we need to ensure that
+        # `mmdit_qk_norm` and `mmdit_dual_attention_indices` is included in the
+        # config.
+        if "mmdit_qk_norm" not in config:
+            config["mmdit_qk_norm"] = None
+        if "mmdit_dual_attention_indices" not in config:
+            config["mmdit_dual_attention_indices"] = None
         return cls(**config)
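Note: the backward-compatibility branch at the end of `from_config` lets configs serialized by older releases, which lack the new MMDiT keys, still deserialize: the missing keys are defaulted to `None` before `cls(**config)` is called. A minimal standalone illustration of that defaulting logic (a sketch, not the actual method):

    def patch_legacy_config(config):
        # Without these defaults, cls(**config) would fail for configs written
        # before `mmdit_qk_norm` and `mmdit_dual_attention_indices` existed.
        config.setdefault("mmdit_qk_norm", None)
        config.setdefault("mmdit_dual_attention_indices", None)
        return config

    legacy = {"mmdit_patch_size": 2}  # heavily truncated, illustrative only
    print(patch_legacy_config(legacy)["mmdit_qk_norm"])  # None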