keras-hub-nightly 0.16.1.dev202410030339__py3-none-any.whl → 0.16.1.dev202410050339__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/api/layers/__init__.py +3 -0
- keras_hub/api/models/__init__.py +9 -0
- keras_hub/src/models/deeplab_v3/__init__.py +7 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_backbone.py +196 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_image_converter.py +10 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_image_segmeter_preprocessor.py +16 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_layers.py +215 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_presets.py +4 -0
- keras_hub/src/models/deeplab_v3/deeplab_v3_segmenter.py +109 -0
- keras_hub/src/models/image_segmenter_preprocessor.py +29 -4
- keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +8 -1
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +57 -93
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py +3 -3
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +5 -3
- keras_hub/src/models/task.py +20 -15
- keras_hub/src/models/vae/__init__.py +1 -0
- keras_hub/src/models/vae/vae_backbone.py +172 -0
- keras_hub/src/models/vae/vae_layers.py +740 -0
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.16.1.dev202410030339.dist-info → keras_hub_nightly-0.16.1.dev202410050339.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.16.1.dev202410030339.dist-info → keras_hub_nightly-0.16.1.dev202410050339.dist-info}/RECORD +23 -14
- keras_hub/src/models/stable_diffusion_3/vae_image_decoder.py +0 -320
- {keras_hub_nightly-0.16.1.dev202410030339.dist-info → keras_hub_nightly-0.16.1.dev202410050339.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.16.1.dev202410030339.dist-info → keras_hub_nightly-0.16.1.dev202410050339.dist-info}/top_level.txt +0 -0
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py
CHANGED
@@ -8,9 +8,6 @@ from keras_hub.src.models.stable_diffusion_3.flow_match_euler_discrete_scheduler
     FlowMatchEulerDiscreteScheduler,
 )
 from keras_hub.src.models.stable_diffusion_3.mmdit import MMDiT
-from keras_hub.src.models.stable_diffusion_3.vae_image_decoder import (
-    VAEImageDecoder,
-)
 from keras_hub.src.utils.keras_utils import standardize_data_format


@@ -159,48 +156,6 @@ class EulerStep(layers.Layer):
         return latents_shape


-class LatentSpaceDecoder(layers.Layer):
-    """Decoder to transform the latent space back to the original image space.
-
-    During decoding, the latents are transformed back to the original image
-    space using the equation: `latents / scale + shift`.
-
-    Args:
-        scale: float. The scaling factor.
-        shift: float. The shift factor.
-        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
-            including `name`, `dtype` etc.
-
-    Call arguments:
-        latents: The latent tensor to be transformed.
-
-    Reference:
-    - [High-Resolution Image Synthesis with Latent Diffusion Models](
-        https://arxiv.org/abs/2112.10752).
-    """
-
-    def __init__(self, scale, shift, **kwargs):
-        super().__init__(**kwargs)
-        self.scale = scale
-        self.shift = shift
-
-    def call(self, latents):
-        return ops.add(ops.divide(latents, self.scale), self.shift)
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "scale": self.scale,
-                "shift": self.shift,
-            }
-        )
-        return config
-
-    def compute_output_shape(self, latents_shape):
-        return latents_shape
-
-
 @keras_hub_export("keras_hub.models.StableDiffusion3Backbone")
 class StableDiffusion3Backbone(Backbone):
     """Stable Diffusion 3 core network with hyperparameters.
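The removed `LatentSpaceDecoder` only computed `latents / scale + shift`. Later in this diff the same transform is rebuilt from a stock `keras.layers.Rescaling` layer. A minimal sketch (not part of the package) checking that equivalence, using the VAE default scale and shift documented in `vae_backbone.py` below:

    import keras
    from keras import ops

    # Rescaling(scale=1/s, offset=t) computes x * (1/s) + t == x / s + t,
    # which is exactly the removed LatentSpaceDecoder math.
    scale, shift = 1.5305, 0.0609
    rescaling = keras.layers.Rescaling(scale=1.0 / scale, offset=shift)
    latents = ops.ones((1, 4, 4, 16))
    expected = ops.add(ops.divide(latents, scale), shift)
    assert ops.all(ops.abs(rescaling(latents) - expected) < 1e-6)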
@@ -222,16 +177,11 @@ class StableDiffusion3Backbone(Backbone):
             transformer in MMDiT.
         mmdit_position_size: int. The size of the height and width for the
             position embedding in MMDiT.
-        vae_stackwise_num_filters: list of ints. The number of filters for
-            each stack in VAE.
-        vae_stackwise_num_blocks: list of ints. The number of blocks for each
-            stack in VAE.
-        clip_l: `keras_hub.models.CLIPTextEncoder`. The text encoder for
-            encoding the inputs.
-        clip_g: `keras_hub.models.CLIPTextEncoder`. The text encoder for
-            encoding the inputs.
-        t5: optional `keras_hub.models.T5Encoder`. The text encoder for
-            encoding the inputs.
+        vae: The VAE used for transformations between pixel space and latent
+            space.
+        clip_l: The CLIP text encoder for encoding the inputs.
+        clip_g: The CLIP text encoder for encoding the inputs.
+        t5: optional The T5 text encoder for encoding the inputs.
         latent_channels: int. The number of channels in the latent. Defaults to
             `16`.
         output_channels: int. The number of channels in the output. Defaults to
@@ -239,7 +189,7 @@ class StableDiffusion3Backbone(Backbone):
         num_train_timesteps: int. The number of diffusion steps to train the
             model. Defaults to `1000`.
         shift: float. The shift value for the timestep schedule. Defaults to
-            `
+            `3.0`.
         height: optional int. The output height of the image.
         width: optional int. The output width of the image.
         data_format: `None` or str. If specified, either `"channels_last"` or
@@ -264,6 +214,7 @@ class StableDiffusion3Backbone(Backbone):
     )

     # Randomly initialized Stable Diffusion 3 model with custom config.
+    vae = keras_hub.models.VAEBackbone(...)
     clip_l = keras_hub.models.CLIPTextEncoder(...)
     clip_g = keras_hub.models.CLIPTextEncoder(...)
     model = keras_hub.models.StableDiffusion3Backbone(
@@ -272,8 +223,7 @@ class StableDiffusion3Backbone(Backbone):
         mmdit_hidden_dim=256,
         mmdit_depth=4,
         mmdit_position_size=192,
-
-        vae_stackwise_num_blocks=[1, 1, 1, 1],
+        vae=vae,
         clip_l=clip_l,
         clip_g=clip_g,
     )
@@ -287,15 +237,14 @@ class StableDiffusion3Backbone(Backbone):
         mmdit_num_layers,
         mmdit_num_heads,
         mmdit_position_size,
-        vae_stackwise_num_filters,
-        vae_stackwise_num_blocks,
+        vae,
         clip_l,
         clip_g,
         t5=None,
         latent_channels=16,
         output_channels=3,
         num_train_timesteps=1000,
-        shift=
+        shift=3.0,
         height=None,
         width=None,
         data_format=None,
@@ -312,9 +261,11 @@ class StableDiffusion3Backbone(Backbone):
         data_format = standardize_data_format(data_format)
         if data_format != "channels_last":
             raise NotImplementedError
-
+        image_shape = (height, width, int(vae.input_channels))
+        latent_shape = (height // 8, width // 8, int(latent_channels))
         context_shape = (None, 4096 if t5 is None else t5.hidden_dim)
         pooled_projection_shape = (clip_l.hidden_dim + clip_g.hidden_dim,)
+        self._latent_shape = latent_shape

         # === Layers ===
         self.clip_l = clip_l
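The backbone now derives the pixel-space input shape from the VAE's `input_channels` and keeps the 8x spatial downsampling for the latent shape. A quick arithmetic sketch with assumed values (a 1024x1024 RGB output and the default 16 latent channels):

    # Illustrative values only; the shapes follow the two lines added above.
    height, width, input_channels, latent_channels = 1024, 1024, 3, 16
    image_shape = (height, width, input_channels)              # (1024, 1024, 3)
    latent_shape = (height // 8, width // 8, latent_channels)  # (128, 128, 16)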
@@ -341,15 +292,7 @@ class StableDiffusion3Backbone(Backbone):
             dtype=dtype,
             name="diffuser",
         )
-        self.decoder = VAEImageDecoder(
-            vae_stackwise_num_filters,
-            vae_stackwise_num_blocks,
-            output_channels,
-            latent_shape=latent_shape,
-            data_format=data_format,
-            dtype=dtype,
-            name="decoder",
-        )
+        self.vae = vae
         # Set `dtype="float32"` to ensure the high precision for the noise
         # residual.
         self.scheduler = FlowMatchEulerDiscreteScheduler(
@@ -365,14 +308,18 @@ class StableDiffusion3Backbone(Backbone):
             dtype="float32", name="classifier_free_guidance"
         )
         self.euler_step = EulerStep(dtype="float32", name="euler_step")
-        self.
-            scale=self.
-
+        self.latent_rescaling = layers.Rescaling(
+            scale=1.0 / self.vae.scale,
+            offset=self.vae.shift,
             dtype="float32",
-            name="
+            name="latent_rescaling",
         )

         # === Functional Model ===
+        image_input = keras.Input(
+            shape=image_shape,
+            name="images",
+        )
         latent_input = keras.Input(
             shape=latent_shape,
             name="latents",
@@ -428,17 +375,19 @@ class StableDiffusion3Backbone(Backbone):
             dtype="float32",
             name="guidance_scale",
         )
-        embeddings = self.
+        embeddings = self.encode_text_step(token_ids, negative_token_ids)
+        latents = self.encode_image_step(image_input)
         # Use `steps=0` to define the functional model.
-
+        denoised_latents = self.denoise_step(
             latent_input,
             embeddings,
             0,
             num_step_input[0],
             guidance_scale_input[0],
         )
-
+        images = self.decode_step(denoised_latents)
         inputs = {
+            "images": image_input,
             "latents": latent_input,
             "clip_l_token_ids": clip_l_token_id_input,
             "clip_l_negative_token_ids": clip_l_negative_token_id_input,
@@ -447,6 +396,10 @@ class StableDiffusion3Backbone(Backbone):
             "num_steps": num_step_input,
             "guidance_scale": guidance_scale_input,
         }
+        outputs = {
+            "latents": latents,
+            "images": images,
+        }
         if self.t5 is not None:
             inputs["t5_token_ids"] = t5_token_id_input
             inputs["t5_negative_token_ids"] = t5_negative_token_id_input
@@ -463,8 +416,6 @@ class StableDiffusion3Backbone(Backbone):
         self.mmdit_num_layers = mmdit_num_layers
         self.mmdit_num_heads = mmdit_num_heads
         self.mmdit_position_size = mmdit_position_size
-        self.vae_stackwise_num_filters = vae_stackwise_num_filters
-        self.vae_stackwise_num_blocks = vae_stackwise_num_blocks
         self.latent_channels = latent_channels
         self.output_channels = output_channels
         self.num_train_timesteps = num_train_timesteps
@@ -474,7 +425,7 @@ class StableDiffusion3Backbone(Backbone):

     @property
     def latent_shape(self):
-        return (None,) +
+        return (None,) + self._latent_shape

     @property
     def clip_hidden_dim(self):
@@ -484,7 +435,7 @@ class StableDiffusion3Backbone(Backbone):
     def t5_hidden_dim(self):
         return 4096 if self.t5 is None else self.t5.hidden_dim

-    def
+    def encode_text_step(self, token_ids, negative_token_ids):
         clip_hidden_dim = self.clip_hidden_dim
         t5_hidden_dim = self.t5_hidden_dim

@@ -537,18 +488,27 @@ class StableDiffusion3Backbone(Backbone):
             negative_pooled_embeddings,
         )

+    def encode_image_step(self, images):
+        latents = self.vae.encode(images)
+        return ops.multiply(
+            ops.subtract(latents, self.vae.shift), self.vae.scale
+        )
+
+    def add_noise_step(self, latents, noises, step, num_steps):
+        return self.scheduler.add_noise(latents, noises, step, num_steps)
+
     def denoise_step(
         self,
         latents,
         embeddings,
-
+        step,
         num_steps,
         guidance_scale,
     ):
-
-
-        sigma, timestep = self.scheduler(
-
+        step = ops.convert_to_tensor(step)
+        next_step = ops.add(step, 1)
+        sigma, timestep = self.scheduler(step, num_steps)
+        next_sigma, _ = self.scheduler(next_step, num_steps)

         # Concatenation for classifier-free guidance.
         concated_latents, contexts, pooled_projs, timesteps = self.cfg_concat(
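`encode_image_step` applies `(latents - shift) * scale`, and the `latent_rescaling` layer used by `decode_step` undoes it with `latents / scale + shift`. A minimal round-trip sketch with assumed latent values and the package's VAE defaults:

    from keras import ops

    scale, shift = 1.5305, 0.0609
    latents = ops.convert_to_tensor([[0.25, -0.5, 1.0]])
    scaled = ops.multiply(ops.subtract(latents, shift), scale)   # encode_image_step
    restored = ops.add(ops.divide(scaled, scale), shift)         # latent_rescaling
    assert ops.all(ops.abs(restored - latents) < 1e-6)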
@@ -570,11 +530,11 @@ class StableDiffusion3Backbone(Backbone):
         predicted_noise = self.cfg(predicted_noise, guidance_scale)

         # Euler step.
-        return self.euler_step(latents, predicted_noise, sigma,
+        return self.euler_step(latents, predicted_noise, sigma, next_sigma)

     def decode_step(self, latents):
-        latents = self.
-        return self.
+        latents = self.latent_rescaling(latents)
+        return self.vae.decode(latents, training=False)

     def get_config(self):
         config = super().get_config()
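The `cfg` layer applied above combines the conditional and unconditional noise predictions with the guidance scale. Its implementation is not shown in this diff; the sketch below is the standard classifier-free guidance formulation, not necessarily the exact layer internals:

    from keras import ops

    def classifier_free_guidance(positive_noise, negative_noise, guidance_scale):
        # Standard CFG: uncond + scale * (cond - uncond).
        return ops.add(
            negative_noise,
            ops.multiply(guidance_scale, ops.subtract(positive_noise, negative_noise)),
        )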
@@ -585,8 +545,7 @@ class StableDiffusion3Backbone(Backbone):
                 "mmdit_num_layers": self.mmdit_num_layers,
                 "mmdit_num_heads": self.mmdit_num_heads,
                 "mmdit_position_size": self.mmdit_position_size,
-                "vae_stackwise_num_filters": self.vae_stackwise_num_filters,
-                "vae_stackwise_num_blocks": self.vae_stackwise_num_blocks,
+                "vae": layers.serialize(self.vae),
                 "clip_l": layers.serialize(self.clip_l),
                 "clip_g": layers.serialize(self.clip_g),
                 "t5": layers.serialize(self.t5),
@@ -607,6 +566,8 @@ class StableDiffusion3Backbone(Backbone):
         # Propagate `dtype` to text encoders if needed.
         if "dtype" in config and config["dtype"] is not None:
             dtype_config = config["dtype"]
+            if "dtype" not in config["vae"]["config"]:
+                config["vae"]["config"]["dtype"] = dtype_config
             if "dtype" not in config["clip_l"]["config"]:
                 config["clip_l"]["config"]["dtype"] = dtype_config
             if "dtype" not in config["clip_g"]["config"]:
@@ -617,7 +578,10 @@ class StableDiffusion3Backbone(Backbone):
             ):
                 config["t5"]["config"]["dtype"] = dtype_config

-        # We expect `clip_l`, `clip_g` and/or `t5` to be instantiated.
+        # We expect `vae`, `clip_l`, `clip_g` and/or `t5` to be instantiated.
+        config["vae"] = layers.deserialize(
+            config["vae"], custom_objects=custom_objects
+        )
         config["clip_l"] = layers.deserialize(
             config["clip_l"], custom_objects=custom_objects
         )
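The VAE sub-model now goes through the same `layers.serialize` / `layers.deserialize` round trip as the text encoders. A minimal sketch of that round trip on a plain Keras layer (a stand-in for the nested `vae` config):

    from keras import layers

    dense = layers.Dense(8, dtype="float32")
    config = layers.serialize(dense)       # nested {"class_name": ..., "config": {...}} dict
    restored = layers.deserialize(config)
    assert isinstance(restored, layers.Dense)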
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py
CHANGED
@@ -5,14 +5,14 @@ backbone_presets = {
         "metadata": {
             "description": (
                 "3 billion parameter, including CLIP L and CLIP G text "
-                "encoders, MMDiT generative model, and VAE
+                "encoders, MMDiT generative model, and VAE autoencoder. "
                 "Developed by Stability AI."
             ),
-            "params":
+            "params": 2987080931,
             "official_name": "StableDiffusion3",
             "path": "stablediffusion3",
             "model_card": "https://arxiv.org/abs/2110.00476",
         },
-        "kaggle_handle": "kaggle://kerashub/stablediffusion3/keras/stable_diffusion_3_medium/
+        "kaggle_handle": "kaggle://kerashub/stablediffusion3/keras/stable_diffusion_3_medium/3",
     }
 }
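The preset metadata above now points the `stable_diffusion_3_medium` handle at version 3 on Kaggle. A hedged usage sketch: loading by preset name should pick up the new VAE-based backbone, with weights downloaded on first use.

    import keras_hub

    backbone = keras_hub.models.StableDiffusion3Backbone.from_preset(
        "stable_diffusion_3_medium"
    )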
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py
CHANGED
@@ -38,11 +38,11 @@ class StableDiffusion3TextToImage(TextToImage):
         ["cute wallpaper art of a cat", "cute wallpaper art of a dog"]
     )

-    # Generate with different `num_steps` and `
+    # Generate with different `num_steps` and `guidance_scale`.
     text_to_image.generate(
         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
         num_steps=50,
-
+        guidance_scale=5.0,
     )
     ```
     """
@@ -104,7 +104,9 @@ class StableDiffusion3TextToImage(TextToImage):
             the expense of lower image quality.
         """
         # Encode inputs.
-        embeddings = self.backbone.
+        embeddings = self.backbone.encode_text_step(
+            token_ids, negative_token_ids
+        )

         # Denoise.
         def body_fun(step, latents):
keras_hub/src/models/task.py
CHANGED
@@ -4,8 +4,11 @@ from rich import markup
 from rich import table as rich_table

 from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.audio_converter import AudioConverter
+from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.preprocessor import Preprocessor
+from keras_hub.src.tokenizers.tokenizer import Tokenizer
 from keras_hub.src.utils.keras_utils import print_msg
 from keras_hub.src.utils.pipeline_model import PipelineModel
 from keras_hub.src.utils.preset_utils import builtin_presets
@@ -324,22 +327,24 @@ class Task(PipelineModel):
                 info,
             )

+        # Since the preprocessor might be nested with multiple `Tokenizer`,
+        # `ImageConverter`, `AudioConverter` and even other `Preprocessor`
+        # instances, we should recursively iterate through them.
         preprocessor = self.preprocessor
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                add_layer(audio_converter, info)
+        if preprocessor and isinstance(preprocessor, keras.Layer):
+            for layer in preprocessor._flatten_layers(include_self=False):
+                if isinstance(layer, Tokenizer):
+                    info = "Vocab size: "
+                    info += highlight_number(layer.vocabulary_size())
+                    add_layer(layer, info)
+                elif isinstance(layer, ImageConverter):
+                    info = "Image size: "
+                    info += highlight_shape(layer.image_size())
+                    add_layer(layer, info)
+                elif isinstance(layer, AudioConverter):
+                    info = "Audio shape: "
+                    info += highlight_shape(layer.audio_shape())
+                    add_layer(layer, info)

         # Print the to the console.
         preprocessor_name = markup.escape(preprocessor.name)
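The summary now walks the preprocessor with the private `_flatten_layers` helper instead of checking fixed attributes, so converters nested anywhere inside the preprocessor are still discovered. A minimal sketch of that traversal pattern using plain Keras layers rather than a keras-hub preprocessor (the helper is private API, shown here only because the diff itself relies on it):

    import keras

    outer = keras.layers.TimeDistributed(keras.layers.Dense(4))
    for layer in outer._flatten_layers(include_self=False):
        print(type(layer).__name__)  # expected to include "Dense"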
keras_hub/src/models/vae/__init__.py
ADDED
@@ -0,0 +1 @@
+from keras_hub.src.models.vae.vae_backbone import VAEBackbone
keras_hub/src/models/vae/vae_backbone.py
ADDED
@@ -0,0 +1,172 @@
+import keras
+
+from keras_hub.src.models.backbone import Backbone
+from keras_hub.src.models.vae.vae_layers import (
+    DiagonalGaussianDistributionSampler,
+)
+from keras_hub.src.models.vae.vae_layers import VAEDecoder
+from keras_hub.src.models.vae.vae_layers import VAEEncoder
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+class VAEBackbone(Backbone):
+    """VAE backbone used in latent diffusion models.
+
+    When encoding, this model generates mean and log variance of the input
+    images. When decoding, it reconstructs images from the latent space.
+
+    Args:
+        encoder_num_filters: list of ints. The number of filters for each
+            block in encoder.
+        encoder_num_blocks: list of ints. The number of blocks for each block in
+            encoder.
+        decoder_num_filters: list of ints. The number of filters for each
+            block in decoder.
+        decoder_num_blocks: list of ints. The number of blocks for each block in
+            decoder.
+        sampler_method: str. The method of the sampler for the intermediate
+            output. Available methods are `"sample"` and `"mode"`. `"sample"`
+            draws from the distribution using both the mean and log variance.
+            `"mode"` draws from the distribution using the mean only. Defaults
+            to `sample`.
+        input_channels: int. The number of channels in the input.
+        sample_channels: int. The number of channels in the sample. Typically,
+            this indicates the intermediate output of VAE, which is mean and
+            log variance.
+        output_channels: int. The number of channels in the output.
+        scale: float. The scaling factor applied to the latent space to ensure
+            it has unit variance during training of the diffusion model.
+            Defaults to `1.5305`, which is the value used in Stable Diffusion 3.
+        shift: float. The shift factor applied to the latent space to ensure it
+            has zero mean during training of the diffusion model. Defaults to
+            `0.0609`, which is the value used in Stable Diffusion 3.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the
+            inputs. `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)`
+            while `"channels_first"` corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The dtype
+            to use for the model's computations and weights.
+    """
+
+    def __init__(
+        self,
+        encoder_num_filters,
+        encoder_num_blocks,
+        decoder_num_filters,
+        decoder_num_blocks,
+        sampler_method="sample",
+        input_channels=3,
+        sample_channels=32,
+        output_channels=3,
+        scale=1.5305,
+        shift=0.0609,
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        data_format = standardize_data_format(data_format)
+        if data_format == "channels_last":
+            image_shape = (None, None, input_channels)
+            channel_axis = -1
+        else:
+            image_shape = (input_channels, None, None)
+            channel_axis = 1
+
+        # === Layers ===
+        self.encoder = VAEEncoder(
+            encoder_num_filters,
+            encoder_num_blocks,
+            output_channels=sample_channels,
+            data_format=data_format,
+            dtype=dtype,
+            name="encoder",
+        )
+        # Use `sample()` to define the functional model.
+        self.distribution_sampler = DiagonalGaussianDistributionSampler(
+            method=sampler_method,
+            axis=channel_axis,
+            dtype=dtype,
+            name="distribution_sampler",
+        )
+        self.decoder = VAEDecoder(
+            decoder_num_filters,
+            decoder_num_blocks,
+            output_channels=output_channels,
+            data_format=data_format,
+            dtype=dtype,
+            name="decoder",
+        )
+
+        # === Functional Model ===
+        image_input = keras.Input(shape=image_shape)
+        sample = self.encoder(image_input)
+        latent = self.distribution_sampler(sample)
+        image_output = self.decoder(latent)
+        super().__init__(
+            inputs=image_input,
+            outputs=image_output,
+            dtype=dtype,
+            **kwargs,
+        )
+
+        # === Config ===
+        self.encoder_num_filters = encoder_num_filters
+        self.encoder_num_blocks = encoder_num_blocks
+        self.decoder_num_filters = decoder_num_filters
+        self.decoder_num_blocks = decoder_num_blocks
+        self.sampler_method = sampler_method
+        self.input_channels = input_channels
+        self.sample_channels = sample_channels
+        self.output_channels = output_channels
+        self._scale = scale
+        self._shift = shift
+
+    @property
+    def scale(self):
+        """The scaling factor for the latent space.
+
+        This is used to scale the latent space to have unit variance when
+        training the diffusion model.
+        """
+        return self._scale
+
+    @property
+    def shift(self):
+        """The shift factor for the latent space.
+
+        This is used to shift the latent space to have zero mean when
+        training the diffusion model.
+        """
+        return self._shift
+
+    def encode(self, inputs, **kwargs):
+        """Encode the input images into latent space."""
+        sample = self.encoder(inputs, **kwargs)
+        return self.distribution_sampler(sample)
+
+    def decode(self, inputs, **kwargs):
+        """Decode the input latent space into images."""
+        return self.decoder(inputs, **kwargs)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "encoder_num_filters": self.encoder_num_filters,
+                "encoder_num_blocks": self.encoder_num_blocks,
+                "decoder_num_filters": self.decoder_num_filters,
+                "decoder_num_blocks": self.decoder_num_blocks,
+                "sampler_method": self.sampler_method,
+                "input_channels": self.input_channels,
+                "sample_channels": self.sample_channels,
+                "output_channels": self.output_channels,
+                "scale": self.scale,
+                "shift": self.shift,
+            }
+        )
+        return config
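A hedged usage sketch of the new backbone based only on the constructor signature above; the filter and block counts are illustrative, not the Stable Diffusion 3 preset configuration, and the concrete layer behavior depends on `vae_layers.py`, which is not shown here.

    import numpy as np
    from keras_hub.src.models.vae.vae_backbone import VAEBackbone

    vae = VAEBackbone(
        encoder_num_filters=[32, 32, 64, 64],
        encoder_num_blocks=[1, 1, 2, 2],
        decoder_num_filters=[64, 64, 32, 32],
        decoder_num_blocks=[2, 2, 1, 1],
    )
    images = np.random.uniform(size=(1, 64, 64, 3)).astype("float32")
    latents = vae.encode(images)   # sampled latent tensor
    decoded = vae.decode(latents)  # reconstruction in pixel space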