keras-hub-nightly 0.16.1.dev202410080341__py3-none-any.whl → 0.16.1.dev202410100339__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/api/layers/__init__.py +3 -0
- keras_hub/api/models/__init__.py +11 -0
- keras_hub/src/layers/preprocessing/image_converter.py +2 -1
- keras_hub/src/models/image_to_image.py +411 -0
- keras_hub/src/models/inpaint.py +513 -0
- keras_hub/src/models/mix_transformer/__init__.py +12 -0
- keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +4 -0
- keras_hub/src/models/mix_transformer/mix_transformer_classifier_preprocessor.py +16 -0
- keras_hub/src/models/mix_transformer/mix_transformer_image_converter.py +8 -0
- keras_hub/src/models/mix_transformer/mix_transformer_layers.py +9 -5
- keras_hub/src/models/mix_transformer/mix_transformer_presets.py +151 -0
- keras_hub/src/models/preprocessor.py +4 -4
- keras_hub/src/models/stable_diffusion_3/mmdit.py +308 -177
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +87 -55
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +171 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +194 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py +1 -1
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +13 -8
- keras_hub/src/models/task.py +1 -1
- keras_hub/src/models/text_to_image.py +89 -36
- keras_hub/src/tests/test_case.py +3 -1
- keras_hub/src/tokenizers/tokenizer.py +7 -7
- keras_hub/src/utils/preset_utils.py +7 -7
- keras_hub/src/utils/timm/preset_loader.py +1 -3
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.16.1.dev202410080341.dist-info → keras_hub_nightly-0.16.1.dev202410100339.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.16.1.dev202410080341.dist-info → keras_hub_nightly-0.16.1.dev202410100339.dist-info}/RECORD +29 -22
- {keras_hub_nightly-0.16.1.dev202410080341.dist-info → keras_hub_nightly-0.16.1.dev202410100339.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.16.1.dev202410080341.dist-info → keras_hub_nightly-0.16.1.dev202410100339.dist-info}/top_level.txt +0 -0
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py CHANGED

````diff
@@ -51,11 +51,52 @@ class CLIPProjection(layers.Layer):
         return (inputs_shape[0], self.hidden_dim)
 
 
-class …
-    def …
-    …
-    …
+class CLIPConcatenate(layers.Layer):
+    def call(
+        self,
+        clip_l_projection,
+        clip_g_projection,
+        clip_l_intermediate_output,
+        clip_g_intermediate_output,
+        padding,
+    ):
+        pooled_embeddings = ops.concatenate(
+            [clip_l_projection, clip_g_projection], axis=-1
+        )
+        embeddings = ops.concatenate(
+            [clip_l_intermediate_output, clip_g_intermediate_output], axis=-1
+        )
+        embeddings = ops.pad(embeddings, [[0, 0], [0, 0], [0, padding]])
+        return pooled_embeddings, embeddings
+
+
+class ImageRescaling(layers.Rescaling):
+    """Rescales inputs from image space to latent space.
+
+    The rescaling is performed using the formula: `(inputs - offset) * scale`.
+    """
+
+    def call(self, inputs):
+        dtype = self.compute_dtype
+        scale = self.backend.cast(self.scale, dtype)
+        offset = self.backend.cast(self.offset, dtype)
+        return (self.backend.cast(inputs, dtype) - offset) * scale
+
 
+class LatentRescaling(layers.Rescaling):
+    """Rescales inputs from latent space to image space.
+
+    The rescaling is performed using the formula: `inputs / scale + offset`.
+    """
+
+    def call(self, inputs):
+        dtype = self.compute_dtype
+        scale = self.backend.cast(self.scale, dtype)
+        offset = self.backend.cast(self.offset, dtype)
+        return (self.backend.cast(inputs, dtype) / scale) + offset
+
+
+class ClassifierFreeGuidanceConcatenate(layers.Layer):
     def call(
         self,
         latents,
````
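The two `Rescaling` subclasses added above are exact inverses of each other; later in this diff, `encode_image_step` applies `ImageRescaling` to the VAE-encoded latents, and `LatentRescaling` presumably undoes it on the decode side. A minimal sketch of the arithmetic, with placeholder `scale`/`shift` values (the real layers take them from `self.vae.scale` and `self.vae.shift`):

```python
# Standalone sketch of the two rescaling formulas, using plain Python floats.
# The scale/shift values below are placeholders for illustration; the actual
# layers receive them from the VAE (`self.vae.scale`, `self.vae.shift`).
scale, shift = 1.5, 0.06  # hypothetical values

def image_to_latent(x):
    return (x - shift) * scale  # ImageRescaling: (inputs - offset) * scale

def latent_to_image(z):
    return z / scale + shift    # LatentRescaling: inputs / scale + offset

x = 0.25
assert abs(latent_to_image(image_to_latent(x)) - x) < 1e-9  # round-trips exactly
```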
````diff
@@ -66,20 +107,16 @@ class ClassifierFreeGuidanceConcatenate(layers.Layer):
         timestep,
     ):
         timestep = ops.broadcast_to(timestep, ops.shape(latents)[:1])
-        latents = ops.concatenate([latents, latents], axis=…
+        latents = ops.concatenate([latents, latents], axis=0)
         contexts = ops.concatenate(
-            [positive_contexts, negative_contexts], axis=…
+            [positive_contexts, negative_contexts], axis=0
         )
         pooled_projections = ops.concatenate(
-            [positive_pooled_projections, negative_pooled_projections],
-            axis=self.axis,
+            [positive_pooled_projections, negative_pooled_projections], axis=0
         )
-        timesteps = ops.concatenate([timestep, timestep], axis=…
+        timesteps = ops.concatenate([timestep, timestep], axis=0)
         return latents, contexts, pooled_projections, timesteps
 
-    def get_config(self):
-        return super().get_config()
-
 
 class ClassifierFreeGuidance(layers.Layer):
     """Perform classifier free guidance.
@@ -100,9 +137,6 @@ class ClassifierFreeGuidance(layers.Layer):
     - [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
     """
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
     def call(self, inputs, guidance_scale):
         positive_noise, negative_noise = ops.split(inputs, 2, axis=0)
         return ops.add(
@@ -112,9 +146,6 @@ class ClassifierFreeGuidance(layers.Layer):
             ),
         )
 
-    def get_config(self):
-        return super().get_config()
-
     def compute_output_shape(self, inputs_shape):
         outputs_shape = list(inputs_shape)
         if outputs_shape[0] is not None:
````
````diff
@@ -142,16 +173,10 @@ class EulerStep(layers.Layer):
     https://arxiv.org/abs/2206.00364).
     """
 
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
     def call(self, latents, noise_residual, sigma, sigma_next):
         sigma_diff = ops.subtract(sigma_next, sigma)
         return ops.add(latents, ops.multiply(sigma_diff, noise_residual))
 
-    def get_config(self):
-        return super().get_config()
-
     def compute_output_shape(self, latents_shape):
         return latents_shape
 
````
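`EulerStep` is equally small once unrolled: it advances the latents by one explicit Euler step along the predicted noise residual, scaled by the sigma increment between two scheduler steps. In scalar form:

```python
# One explicit Euler step, as in `EulerStep.call`: the latent moves along the
# predicted residual (velocity) by the sigma increment.
def euler_step(latents, noise_residual, sigma, sigma_next):
    return latents + (sigma_next - sigma) * noise_residual

print(euler_step(latents=1.0, noise_residual=-0.5, sigma=1.0, sigma_next=0.8))  # 1.1
```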
````diff
@@ -272,12 +297,13 @@ class StableDiffusion3Backbone(Backbone):
         self.clip_l_projection = CLIPProjection(
             clip_l.hidden_dim, dtype=dtype, name="clip_l_projection"
         )
-        self.clip_l_projection.build([None, clip_l.hidden_dim], None)
         self.clip_g = clip_g
         self.clip_g_projection = CLIPProjection(
             clip_g.hidden_dim, dtype=dtype, name="clip_g_projection"
         )
-        self.…
+        self.clip_concatenate = CLIPConcatenate(
+            dtype=dtype, name="clip_concatenate"
+        )
         self.t5 = t5
         self.diffuser = MMDiT(
             mmdit_patch_size,
@@ -293,6 +319,12 @@ class StableDiffusion3Backbone(Backbone):
             name="diffuser",
         )
         self.vae = vae
+        self.cfg_concat = ClassifierFreeGuidanceConcatenate(
+            dtype=dtype, name="classifier_free_guidance_concat"
+        )
+        self.cfg = ClassifierFreeGuidance(
+            dtype=dtype, name="classifier_free_guidance"
+        )
         # Set `dtype="float32"` to ensure the high precision for the noise
         # residual.
         self.scheduler = FlowMatchEulerDiscreteScheduler(
@@ -301,17 +333,17 @@ class StableDiffusion3Backbone(Backbone):
             dtype="float32",
             name="scheduler",
         )
-        self.cfg_concat = ClassifierFreeGuidanceConcatenate(
-            dtype="float32", name="classifier_free_guidance_concat"
-        )
-        self.cfg = ClassifierFreeGuidance(
-            dtype="float32", name="classifier_free_guidance"
-        )
         self.euler_step = EulerStep(dtype="float32", name="euler_step")
-        self.…
-            scale=…
+        self.image_rescaling = ImageRescaling(
+            scale=self.vae.scale,
             offset=self.vae.shift,
-            dtype=…
+            dtype=dtype,
+            name="image_rescaling",
+        )
+        self.latent_rescaling = LatentRescaling(
+            scale=self.vae.scale,
+            offset=self.vae.shift,
+            dtype=dtype,
             name="latent_rescaling",
         )
 
@@ -440,8 +472,12 @@ class StableDiffusion3Backbone(Backbone):
         t5_hidden_dim = self.t5_hidden_dim
 
         def encode(token_ids):
-            clip_l_outputs = self.clip_l(…
-            …
+            clip_l_outputs = self.clip_l(
+                {"token_ids": token_ids["clip_l"]}, training=False
+            )
+            clip_g_outputs = self.clip_g(
+                {"token_ids": token_ids["clip_g"]}, training=False
+            )
             clip_l_projection = self.clip_l_projection(
                 clip_l_outputs["sequence_output"],
                 token_ids["clip_l"],
@@ -452,23 +488,21 @@ class StableDiffusion3Backbone(Backbone):
                 token_ids["clip_g"],
                 training=False,
             )
-            pooled_embeddings = …
-            …
-            …
-            …
-            …
-            …
-                    clip_l_outputs["intermediate_output"],
-                    clip_g_outputs["intermediate_output"],
-                ],
-                axis=-1,
-            )
-            embeddings = ops.pad(
-                embeddings,
-                [[0, 0], [0, 0], [0, t5_hidden_dim - clip_hidden_dim]],
+            pooled_embeddings, embeddings = self.clip_concatenate(
+                clip_l_projection,
+                clip_g_projection,
+                clip_l_outputs["intermediate_output"],
+                clip_g_outputs["intermediate_output"],
+                padding=t5_hidden_dim - clip_hidden_dim,
             )
             if self.t5 is not None:
-                t5_outputs = self.t5(…
+                t5_outputs = self.t5(
+                    {
+                        "token_ids": token_ids["t5"],
+                        "padding_mask": ops.ones_like(token_ids["t5"]),
+                    },
+                    training=False,
+                )
                 embeddings = ops.concatenate([embeddings, t5_outputs], axis=-2)
             else:
                 padded_size = self.clip_l.max_sequence_length
@@ -490,9 +524,7 @@ class StableDiffusion3Backbone(Backbone):
 
     def encode_image_step(self, images):
         latents = self.vae.encode(images)
-        return …
-            ops.subtract(latents, self.vae.shift), self.vae.scale
-        )
+        return self.image_rescaling(latents)
 
     def add_noise_step(self, latents, noises, step, num_steps):
         return self.scheduler.add_noise(latents, noises, step, num_steps)
````
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py ADDED

````diff
@@ -0,0 +1,171 @@
+from keras import ops
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.image_to_image import ImageToImage
+from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_backbone import (
+    StableDiffusion3Backbone,
+)
+from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_text_to_image_preprocessor import (
+    StableDiffusion3TextToImagePreprocessor,
+)
+
+
+@keras_hub_export("keras_hub.models.StableDiffusion3ImageToImage")
+class StableDiffusion3ImageToImage(ImageToImage):
+    """An end-to-end Stable Diffusion 3 model for image-to-image generation.
+
+    This model has a `generate()` method, which generates images based
+    on a combination of a reference image and a text prompt.
+
+    Args:
+        backbone: A `keras_hub.models.StableDiffusion3Backbone` instance.
+        preprocessor: A
+            `keras_hub.models.StableDiffusion3TextToImagePreprocessor` instance.
+
+    Examples:
+
+    Use `generate()` to do image generation.
+    ```python
+    image_to_image = keras_hub.models.StableDiffusion3ImageToImage.from_preset(
+        "stable_diffusion_3_medium", height=512, width=512
+    )
+    image_to_image.generate(
+        {
+            "images": np.ones((512, 512, 3), dtype="float32"),
+            "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+        }
+    )
+
+    # Generate with batched prompts.
+    image_to_image.generate(
+        {
+            "images": np.ones((2, 512, 512, 3), dtype="float32"),
+            "prompts": ["cute wallpaper art of a cat", "cute wallpaper art of a dog"],
+        }
+    )
+
+    # Generate with different `num_steps`, `guidance_scale` and `strength`.
+    image_to_image.generate(
+        {
+            "images": np.ones((512, 512, 3), dtype="float32"),
+            "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+        }
+        num_steps=50,
+        guidance_scale=5.0,
+        strength=0.6,
+    )
+
+    # Generate with `negative_prompts`.
+    text_to_image.generate(
+        {
+            "images": np.ones((512, 512, 3), dtype="float32"),
+            "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+            "negative_prompts": "green color",
+        }
+    )
+    ```
+    """
+
+    backbone_cls = StableDiffusion3Backbone
+    preprocessor_cls = StableDiffusion3TextToImagePreprocessor
+
+    def __init__(
+        self,
+        backbone,
+        preprocessor,
+        **kwargs,
+    ):
+        # === Layers ===
+        self.backbone = backbone
+        self.preprocessor = preprocessor
+
+        # === Functional Model ===
+        inputs = backbone.input
+        outputs = backbone.output
+        super().__init__(
+            inputs=inputs,
+            outputs=outputs,
+            **kwargs,
+        )
+
+    def fit(self, *args, **kwargs):
+        raise NotImplementedError(
+            "Currently, `fit` is not supported for "
+            "`StableDiffusion3ImageToImage`."
+        )
+
+    def generate_step(
+        self,
+        images,
+        noises,
+        token_ids,
+        starting_step,
+        num_steps,
+        guidance_scale,
+    ):
+        """A compilable generation function for batched of inputs.
+
+        This function represents the inner, XLA-compilable, generation function
+        for batched inputs.
+
+        Args:
+            images: A (batch_size, image_height, image_width, 3) tensor
+                containing the reference images.
+            noises: A (batch_size, latent_height, latent_width, channels) tensor
+                containing the noises to be added to the latents. Typically,
+                this tensor is sampled from the Gaussian distribution.
+            token_ids: A pair of (batch_size, num_tokens) tensor containing the
+                tokens based on the input prompts and negative prompts.
+            starting_step: int. The number of the starting diffusion step.
+            num_steps: int. The number of diffusion steps to take.
+            guidance_scale: float. The classifier free guidance scale defined in
+                [Classifier-Free Diffusion Guidance](
+                https://arxiv.org/abs/2207.12598). Higher scale encourages to
+                generate images that are closely linked to prompts, usually at
+                the expense of lower image quality.
+        """
+        token_ids, negative_token_ids = token_ids
+
+        # Encode images.
+        latents = self.backbone.encode_image_step(images)
+
+        # Add noises to latents.
+        latents = self.backbone.add_noise_step(
+            latents, noises, starting_step, num_steps
+        )
+
+        # Encode inputs.
+        embeddings = self.backbone.encode_text_step(
+            token_ids, negative_token_ids
+        )
+
+        # Denoise.
+        def body_fun(step, latents):
+            return self.backbone.denoise_step(
+                latents,
+                embeddings,
+                step,
+                num_steps,
+                guidance_scale,
+            )
+
+        latents = ops.fori_loop(starting_step, num_steps, body_fun, latents)
+
+        # Decode.
+        return self.backbone.decode_step(latents)
+
+    def generate(
+        self,
+        inputs,
+        num_steps=50,
+        guidance_scale=7.0,
+        strength=0.8,
+        seed=None,
+    ):
+        return super().generate(
+            inputs,
+            num_steps=num_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            seed=seed,
+        )
````
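The new task wires image-to-image generation on top of the backbone steps shown earlier: encode the reference image, jump into the schedule at `starting_step`, then denoise to the end. The mapping from `strength` to `starting_step` lives in the base `ImageToImage` class (`keras_hub/src/models/image_to_image.py`, not shown in this excerpt); the sketch below assumes the conventional img2img mapping and is illustrative only.

```python
# Hypothetical illustration of how `strength` typically selects the starting step
# in img2img pipelines. This is an assumption; the real mapping is implemented in
# the `ImageToImage` base class, which this diff excerpt does not include.
num_steps = 50
strength = 0.8
starting_step = int(num_steps * (1.0 - strength))
print(starting_step)  # 10 -> the denoising loop then runs steps 10..49
```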
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py ADDED

````diff
@@ -0,0 +1,194 @@
+from keras import ops
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.inpaint import Inpaint
+from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_backbone import (
+    StableDiffusion3Backbone,
+)
+from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_text_to_image_preprocessor import (
+    StableDiffusion3TextToImagePreprocessor,
+)
+
+
+@keras_hub_export("keras_hub.models.StableDiffusion3Inpaint")
+class StableDiffusion3Inpaint(Inpaint):
+    """An end-to-end Stable Diffusion 3 model for inpaint generation.
+
+    This model has a `generate()` method, which generates images based
+    on a combination of a reference image, mask and a text prompt.
+
+    Args:
+        backbone: A `keras_hub.models.StableDiffusion3Backbone` instance.
+        preprocessor: A
+            `keras_hub.models.StableDiffusion3TextToImagePreprocessor` instance.
+
+    Examples:
+
+    Use `generate()` to do image generation.
+    ```python
+    reference_image = np.ones((1024, 1024, 3), dtype="float32")
+    reference_mask = np.ones((1024, 1024), dtype="float32")
+    inpaint = keras_hub.models.StableDiffusion3Inpaint.from_preset(
+        "stable_diffusion_3_medium", height=512, width=512
+    )
+    inpaint.generate(
+        reference_image,
+        reference_mask,
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+    )
+
+    # Generate with batched prompts.
+    reference_images = np.ones((2, 512, 512, 3), dtype="float32")
+    reference_mask = np.ones((2, 1024, 1024), dtype="float32")
+    inpaint.generate(
+        reference_images,
+        reference_mask,
+        ["cute wallpaper art of a cat", "cute wallpaper art of a dog"]
+    )
+
+    # Generate with different `num_steps`, `guidance_scale` and `strength`.
+    inpaint.generate(
+        reference_image,
+        reference_mask,
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+        num_steps=50,
+        guidance_scale=5.0,
+        strength=0.6,
+    )
+    ```
+    """
+
+    backbone_cls = StableDiffusion3Backbone
+    preprocessor_cls = StableDiffusion3TextToImagePreprocessor
+
+    def __init__(
+        self,
+        backbone,
+        preprocessor,
+        **kwargs,
+    ):
+        # === Layers ===
+        self.backbone = backbone
+        self.preprocessor = preprocessor
+
+        # === Functional Model ===
+        inputs = backbone.input
+        outputs = backbone.output
+        super().__init__(
+            inputs=inputs,
+            outputs=outputs,
+            **kwargs,
+        )
+
+    def fit(self, *args, **kwargs):
+        raise NotImplementedError(
+            "Currently, `fit` is not supported for "
+            "`StableDiffusion3Inpaint`."
+        )
+
+    def generate_step(
+        self,
+        images,
+        masks,
+        noises,
+        token_ids,
+        starting_step,
+        num_steps,
+        guidance_scale,
+    ):
+        """A compilable generation function for batched of inputs.
+
+        This function represents the inner, XLA-compilable, generation function
+        for batched inputs.
+
+        Args:
+            images: A (batch_size, image_height, image_width, 3) tensor
+                containing the reference images.
+            masks: A (batch_size, image_height, image_width) tensor
+                containing the reference masks.
+            noises: A (batch_size, latent_height, latent_width, channels) tensor
+                containing the noises to be added to the latents. Typically,
+                this tensor is sampled from the Gaussian distribution.
+            token_ids: A pair of (batch_size, num_tokens) tensor containing the
+                tokens based on the input prompts and negative prompts.
+            starting_step: int. The number of the starting diffusion step.
+            num_steps: int. The number of diffusion steps to take.
+            guidance_scale: float. The classifier free guidance scale defined in
+                [Classifier-Free Diffusion Guidance](
+                https://arxiv.org/abs/2207.12598). Higher scale encourages to
+                generate images that are closely linked to prompts, usually at
+                the expense of lower image quality.
+        """
+        token_ids, negative_token_ids = token_ids
+
+        # Get masked images.
+        masks = ops.cast(ops.expand_dims(masks, axis=-1) > 0.5, images.dtype)
+        masks_latent_size = ops.image.resize(
+            masks,
+            (self.backbone.latent_shape[1], self.backbone.latent_shape[2]),
+            interpolation="nearest",
+        )
+
+        # Encode images.
+        image_latents = self.backbone.encode_image_step(images)
+
+        # Add noises to latents.
+        latents = self.backbone.add_noise_step(
+            image_latents, noises, starting_step, num_steps
+        )
+
+        # Encode inputs.
+        embeddings = self.backbone.encode_text_step(
+            token_ids, negative_token_ids
+        )
+
+        # Denoise.
+        def body_fun(step, latents):
+            latents = self.backbone.denoise_step(
+                latents,
+                embeddings,
+                step,
+                num_steps,
+                guidance_scale,
+            )
+
+            # Compute the previous latents x_t -> x_t-1.
+            def true_fn():
+                next_step = ops.add(step, 1)
+                return self.backbone.add_noise_step(
+                    image_latents, noises, next_step, num_steps
+                )
+
+            init_latents = ops.cond(
+                step < ops.subtract(num_steps, 1),
+                true_fn,
+                lambda: ops.cast(image_latents, noises.dtype),
+            )
+            latents = ops.add(
+                ops.multiply(
+                    ops.subtract(1.0, masks_latent_size), init_latents
+                ),
+                ops.multiply(masks_latent_size, latents),
+            )
+            return latents
+
+        latents = ops.fori_loop(starting_step, num_steps, body_fun, latents)
+
+        # Decode.
+        return self.backbone.decode_step(latents)
+
+    def generate(
+        self,
+        inputs,
+        num_steps=50,
+        guidance_scale=7.0,
+        strength=0.6,
+        seed=None,
+    ):
+        return super().generate(
+            inputs,
+            num_steps=num_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            seed=seed,
+        )
````
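The inpainting variant differs from image-to-image in one place: inside `body_fun`, after each denoise step the latents outside the mask are snapped back to a re-noised copy of the reference image, so only the masked region is actually repainted. In NumPy terms the blending at the end of the loop body is:

```python
import numpy as np

# Shapes are illustrative only. `init_latents` stands in for the re-noised
# reference latents (`true_fn` above), `denoised` for the output of `denoise_step`.
mask = np.zeros((1, 8, 8, 1), dtype="float32")
mask[:, 2:6, 2:6, :] = 1.0  # region to repaint
init_latents = np.random.randn(1, 8, 8, 16).astype("float32")
denoised = np.random.randn(1, 8, 8, 16).astype("float32")

blended = (1.0 - mask) * init_latents + mask * denoised  # keep outside, repaint inside
```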
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py CHANGED

````diff
@@ -13,6 +13,6 @@ backbone_presets = {
             "path": "stablediffusion3",
             "model_card": "https://arxiv.org/abs/2110.00476",
         },
-        "kaggle_handle": "kaggle://kerashub/stablediffusion3/keras/stable_diffusion_3_medium/…
+        "kaggle_handle": "kaggle://kerashub/stablediffusion3/keras/stable_diffusion_3_medium/4",
     }
 }
````
keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py CHANGED

````diff
@@ -44,6 +44,14 @@ class StableDiffusion3TextToImage(TextToImage):
         num_steps=50,
         guidance_scale=5.0,
     )
+
+    # Generate with `negative_prompts`.
+    text_to_image.generate(
+        {
+            "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+            "negative_prompts": "green color",
+        }
+    )
     ```
     """
 
@@ -79,7 +87,6 @@ class StableDiffusion3TextToImage(TextToImage):
         self,
         latents,
         token_ids,
-        negative_token_ids,
         num_steps,
         guidance_scale,
     ):
@@ -92,10 +99,8 @@ class StableDiffusion3TextToImage(TextToImage):
             latents: A (batch_size, height, width, channels) tensor
                 containing the latents to start generation from. Typically, this
                 tensor is sampled from the Gaussian distribution.
-            token_ids: A (batch_size, num_tokens) tensor containing the
-                tokens based on the input prompts.
-            negative_token_ids: A (batch_size, num_tokens) tensor
-                containing the negative tokens based on the input prompts.
+            token_ids: A pair of (batch_size, num_tokens) tensor containing the
+                tokens based on the input prompts and negative prompts.
             num_steps: int. The number of diffusion steps to take.
             guidance_scale: float. The classifier free guidance scale defined in
                 [Classifier-Free Diffusion Guidance](
@@ -103,7 +108,9 @@ class StableDiffusion3TextToImage(TextToImage):
                 generate images that are closely linked to prompts, usually at
                 the expense of lower image quality.
         """
-
+        token_ids, negative_token_ids = token_ids
+
+        # Encode prompts.
         embeddings = self.backbone.encode_text_step(
             token_ids, negative_token_ids
         )
@@ -126,14 +133,12 @@ class StableDiffusion3TextToImage(TextToImage):
     def generate(
         self,
         inputs,
-        negative_inputs=None,
         num_steps=28,
         guidance_scale=7.0,
         seed=None,
     ):
         return super().generate(
             inputs,
-            negative_inputs=negative_inputs,
             num_steps=num_steps,
             guidance_scale=guidance_scale,
             seed=seed,
````
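The net effect of the text-to-image changes is an API shift: `generate()` no longer takes a `negative_inputs` keyword, and `generate_step` now receives `token_ids` as a `(token_ids, negative_token_ids)` pair. Negative prompts travel inside the input dict instead, as the updated docstring example shows. A minimal sketch, assuming `text_to_image` is an already-constructed `StableDiffusion3TextToImage`:

```python
# Before (0.16.1.dev202410080341): a separate keyword argument.
# text_to_image.generate(
#     "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
#     negative_inputs="green color",
# )

# After (0.16.1.dev202410100339): negative prompts ride along in the input dict.
text_to_image.generate(
    {
        "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
        "negative_prompts": "green color",
    }
)
```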
keras_hub/src/models/task.py CHANGED

````diff
@@ -339,7 +339,7 @@ class Task(PipelineModel):
             add_layer(layer, info)
         elif isinstance(layer, ImageConverter):
             info = "Image size: "
-            info += highlight_shape(layer.image_size…
+            info += highlight_shape(layer.image_size)
             add_layer(layer, info)
         elif isinstance(layer, AudioConverter):
             info = "Audio shape: "
````