keras-hub-nightly 0.16.1.dev202409250340__py3-none-any.whl → 0.16.1.dev202409260340__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. keras_hub/api/layers/__init__.py +3 -0
  2. keras_hub/api/models/__init__.py +16 -0
  3. keras_hub/api/tokenizers/__init__.py +1 -0
  4. keras_hub/src/models/{stable_diffusion_v3 → clip}/clip_encoder_block.py +8 -2
  5. keras_hub/src/models/clip/clip_preprocessor.py +147 -0
  6. keras_hub/src/models/{stable_diffusion_v3 → clip}/clip_text_encoder.py +60 -57
  7. keras_hub/src/models/{stable_diffusion_v3 → clip}/clip_tokenizer.py +69 -30
  8. keras_hub/src/models/densenet/__init__.py +6 -0
  9. keras_hub/src/models/densenet/densenet_backbone.py +11 -8
  10. keras_hub/src/models/densenet/densenet_image_classifier.py +27 -4
  11. keras_hub/src/models/densenet/densenet_image_classifier_preprocessor.py +27 -0
  12. keras_hub/src/models/densenet/densenet_image_converter.py +23 -0
  13. keras_hub/src/models/densenet/densenet_presets.py +56 -0
  14. keras_hub/src/models/stable_diffusion_3/__init__.py +13 -0
  15. keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +93 -0
  16. keras_hub/src/models/{stable_diffusion_v3 → stable_diffusion_3}/mmdit.py +351 -26
  17. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +630 -0
  18. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +151 -0
  19. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +77 -0
  20. keras_hub/src/models/{stable_diffusion_v3/t5_xxl_text_encoder.py → stable_diffusion_3/t5_encoder.py} +7 -7
  21. keras_hub/src/models/stable_diffusion_3/vae_image_decoder.py +333 -0
  22. keras_hub/src/models/{stable_diffusion_v3/t5_xxl_preprocessor.py → t5/t5_preprocessor.py} +12 -3
  23. keras_hub/src/models/text_to_image.py +295 -0
  24. keras_hub/src/utils/timm/convert_densenet.py +107 -0
  25. keras_hub/src/utils/timm/preset_loader.py +3 -0
  26. keras_hub/src/version_utils.py +1 -1
  27. {keras_hub_nightly-0.16.1.dev202409250340.dist-info → keras_hub_nightly-0.16.1.dev202409260340.dist-info}/METADATA +1 -1
  28. {keras_hub_nightly-0.16.1.dev202409250340.dist-info → keras_hub_nightly-0.16.1.dev202409260340.dist-info}/RECORD +31 -23
  29. keras_hub/src/models/stable_diffusion_v3/clip_preprocessor.py +0 -93
  30. keras_hub/src/models/stable_diffusion_v3/mmdit_block.py +0 -317
  31. keras_hub/src/models/stable_diffusion_v3/vae_attention.py +0 -126
  32. keras_hub/src/models/stable_diffusion_v3/vae_image_decoder.py +0 -186
  33. /keras_hub/src/models/{stable_diffusion_v3 → clip}/__init__.py +0 -0
  34. {keras_hub_nightly-0.16.1.dev202409250340.dist-info → keras_hub_nightly-0.16.1.dev202409260340.dist-info}/WHEEL +0 -0
  35. {keras_hub_nightly-0.16.1.dev202409250340.dist-info → keras_hub_nightly-0.16.1.dev202409260340.dist-info}/top_level.txt +0 -0
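Note on the rename: the `stable_diffusion_v3` package becomes `stable_diffusion_3`, and the former `mmdit_block.py` module is folded into `mmdit.py` (see the hunks below). As a quick orientation, a minimal import sketch; the module paths follow directly from the file list and the hunks below, and no additional re-exports are assumed:

    # Old nightly (0.16.1.dev202409250340):
    # from keras_hub.src.models.stable_diffusion_v3.mmdit_block import MMDiTBlock

    # New nightly (0.16.1.dev202409260340): MMDiTBlock now lives next to MMDiT
    # in the renamed package's mmdit.py module.
    from keras_hub.src.models.stable_diffusion_3.mmdit import MMDiT, MMDiTBlock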
@@ -19,7 +19,8 @@ from keras import models
  from keras import ops

  from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding
- from keras_hub.src.models.stable_diffusion_v3.mmdit_block import MMDiTBlock
+ from keras_hub.src.models.backbone import Backbone
+ from keras_hub.src.utils.keras_utils import gelu_approximate
  from keras_hub.src.utils.keras_utils import standardize_data_format


@@ -79,8 +80,8 @@ class AdjustablePositionEmbedding(PositionEmbedding):
          width = width or self.width
          shape = ops.shape(inputs)
          feature_length = shape[-1]
-         top = ops.floor_divide(self.height - height, 2)
-         left = ops.floor_divide(self.width - width, 2)
+         top = ops.cast(ops.floor_divide(self.height - height, 2), "int32")
+         left = ops.cast(ops.floor_divide(self.width - width, 2), "int32")
          position_embedding = ops.convert_to_tensor(self.position_embeddings)
          position_embedding = ops.reshape(
              position_embedding, (self.height, self.width, feature_length)
@@ -166,6 +167,305 @@ class TimestepEmbedding(layers.Layer):
          return output_shape


+ class DismantledBlock(layers.Layer):
+     def __init__(
+         self,
+         num_heads,
+         hidden_dim,
+         mlp_ratio=4.0,
+         use_projection=True,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.num_heads = num_heads
+         self.hidden_dim = hidden_dim
+         self.mlp_ratio = mlp_ratio
+         self.use_projection = use_projection
+
+         head_dim = hidden_dim // num_heads
+         self.head_dim = head_dim
+         mlp_hidden_dim = int(hidden_dim * mlp_ratio)
+         self.mlp_hidden_dim = mlp_hidden_dim
+         num_modulations = 6 if use_projection else 2
+         self.num_modulations = num_modulations
+
+         self.adaptive_norm_modulation = models.Sequential(
+             [
+                 layers.Activation("silu", dtype=self.dtype_policy),
+                 layers.Dense(
+                     num_modulations * hidden_dim, dtype=self.dtype_policy
+                 ),
+             ],
+             name="adaptive_norm_modulation",
+         )
+         self.norm1 = layers.LayerNormalization(
+             epsilon=1e-6,
+             center=False,
+             scale=False,
+             dtype="float32",
+             name="norm1",
+         )
+         self.attention_qkv = layers.Dense(
+             hidden_dim * 3, dtype=self.dtype_policy, name="attention_qkv"
+         )
+         if use_projection:
+             self.attention_proj = layers.Dense(
+                 hidden_dim, dtype=self.dtype_policy, name="attention_proj"
+             )
+             self.norm2 = layers.LayerNormalization(
+                 epsilon=1e-6,
+                 center=False,
+                 scale=False,
+                 dtype="float32",
+                 name="norm2",
+             )
+             self.mlp = models.Sequential(
+                 [
+                     layers.Dense(
+                         mlp_hidden_dim,
+                         activation=gelu_approximate,
+                         dtype=self.dtype_policy,
+                     ),
+                     layers.Dense(
+                         hidden_dim,
+                         dtype=self.dtype_policy,
+                     ),
+                 ],
+                 name="mlp",
+             )
+
+     def build(self, inputs_shape, timestep_embedding):
+         self.adaptive_norm_modulation.build(timestep_embedding)
+         self.attention_qkv.build(inputs_shape)
+         self.norm1.build(inputs_shape)
+         if self.use_projection:
+             self.attention_proj.build(inputs_shape)
+             self.norm2.build(inputs_shape)
+             self.mlp.build(inputs_shape)
+
+     def _modulate(self, inputs, shift, scale):
+         shift = ops.expand_dims(shift, axis=1)
+         scale = ops.expand_dims(scale, axis=1)
+         return ops.add(ops.multiply(inputs, ops.add(scale, 1.0)), shift)
+
+     def _compute_pre_attention(self, inputs, timestep_embedding, training=None):
+         batch_size = ops.shape(inputs)[0]
+         if self.use_projection:
+             modulation = self.adaptive_norm_modulation(
+                 timestep_embedding, training=training
+             )
+             modulation = ops.reshape(
+                 modulation, (batch_size, 6, self.hidden_dim)
+             )
+             (
+                 shift_msa,
+                 scale_msa,
+                 gate_msa,
+                 shift_mlp,
+                 scale_mlp,
+                 gate_mlp,
+             ) = ops.unstack(modulation, 6, axis=1)
+             qkv = self.attention_qkv(
+                 self._modulate(self.norm1(inputs), shift_msa, scale_msa),
+                 training=training,
+             )
+             qkv = ops.reshape(
+                 qkv, (batch_size, -1, 3, self.num_heads, self.head_dim)
+             )
+             q, k, v = ops.unstack(qkv, 3, axis=2)
+             return (q, k, v), (inputs, gate_msa, shift_mlp, scale_mlp, gate_mlp)
+         else:
+             modulation = self.adaptive_norm_modulation(
+                 timestep_embedding, training=training
+             )
+             modulation = ops.reshape(
+                 modulation, (batch_size, 2, self.hidden_dim)
+             )
+             shift_msa, scale_msa = ops.unstack(modulation, 2, axis=1)
+             qkv = self.attention_qkv(
+                 self._modulate(self.norm1(inputs), shift_msa, scale_msa),
+                 training=training,
+             )
+             qkv = ops.reshape(
+                 qkv, (batch_size, -1, 3, self.num_heads, self.head_dim)
+             )
+             q, k, v = ops.unstack(qkv, 3, axis=2)
+             return (q, k, v)
+
+     def _compute_post_attention(
+         self, inputs, inputs_intermediates, training=None
+     ):
+         x, gate_msa, shift_mlp, scale_mlp, gate_mlp = inputs_intermediates
+         attn = self.attention_proj(inputs, training=training)
+         x = ops.add(x, ops.multiply(ops.expand_dims(gate_msa, axis=1), attn))
+         x = ops.add(
+             x,
+             ops.multiply(
+                 ops.expand_dims(gate_mlp, axis=1),
+                 self.mlp(
+                     self._modulate(self.norm2(x), shift_mlp, scale_mlp),
+                     training=training,
+                 ),
+             ),
+         )
+         return x
+
+     def call(
+         self,
+         inputs,
+         timestep_embedding=None,
+         inputs_intermediates=None,
+         pre_attention=True,
+         training=None,
+     ):
+         if pre_attention:
+             return self._compute_pre_attention(
+                 inputs, timestep_embedding, training=training
+             )
+         else:
+             return self._compute_post_attention(
+                 inputs, inputs_intermediates, training=training
+             )
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "num_heads": self.num_heads,
+                 "hidden_dim": self.hidden_dim,
+                 "mlp_ratio": self.mlp_ratio,
+                 "use_projection": self.use_projection,
+             }
+         )
+         return config
+
+
+ class MMDiTBlock(layers.Layer):
+     def __init__(
+         self,
+         num_heads,
+         hidden_dim,
+         mlp_ratio=4.0,
+         use_context_projection=True,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.num_heads = num_heads
+         self.hidden_dim = hidden_dim
+         self.mlp_ratio = mlp_ratio
+         self.use_context_projection = use_context_projection
+
+         head_dim = hidden_dim // num_heads
+         self.head_dim = head_dim
+         self._inverse_sqrt_key_dim = 1.0 / math.sqrt(head_dim)
+         self._dot_product_equation = "aecd,abcd->acbe"
+         self._combine_equation = "acbe,aecd->abcd"
+
+         self.x_block = DismantledBlock(
+             num_heads=num_heads,
+             hidden_dim=hidden_dim,
+             mlp_ratio=mlp_ratio,
+             use_projection=True,
+             dtype=self.dtype_policy,
+             name="x_block",
+         )
+         self.context_block = DismantledBlock(
+             num_heads=num_heads,
+             hidden_dim=hidden_dim,
+             mlp_ratio=mlp_ratio,
+             use_projection=use_context_projection,
+             dtype=self.dtype_policy,
+             name="context_block",
+         )
+         self.softmax = layers.Softmax(dtype="float32")
+
+     def build(self, inputs_shape, context_shape, timestep_embedding_shape):
+         self.x_block.build(inputs_shape, timestep_embedding_shape)
+         self.context_block.build(context_shape, timestep_embedding_shape)
+
+     def _compute_attention(self, query, key, value):
+         query = ops.multiply(
+             query, ops.cast(self._inverse_sqrt_key_dim, query.dtype)
+         )
+         attention_scores = ops.einsum(self._dot_product_equation, key, query)
+         attention_scores = self.softmax(attention_scores)
+         attention_scores = ops.cast(attention_scores, self.compute_dtype)
+         attention_output = ops.einsum(
+             self._combine_equation, attention_scores, value
+         )
+         batch_size = ops.shape(attention_output)[0]
+         attention_output = ops.reshape(
+             attention_output, (batch_size, -1, self.num_heads * self.head_dim)
+         )
+         return attention_output
+
+     def call(self, inputs, context, timestep_embedding, training=None):
+         # Compute pre-attention.
+         x = inputs
+         if self.use_context_projection:
+             context_qkv, context_intermediates = self.context_block(
+                 context,
+                 timestep_embedding=timestep_embedding,
+                 training=training,
+             )
+         else:
+             context_qkv = self.context_block(
+                 context,
+                 timestep_embedding=timestep_embedding,
+                 training=training,
+             )
+         context_len = ops.shape(context_qkv[0])[1]
+         x_qkv, x_intermediates = self.x_block(
+             x, timestep_embedding=timestep_embedding, training=training
+         )
+         q = ops.concatenate([context_qkv[0], x_qkv[0]], axis=1)
+         k = ops.concatenate([context_qkv[1], x_qkv[1]], axis=1)
+         v = ops.concatenate([context_qkv[2], x_qkv[2]], axis=1)
+
+         # Compute attention.
+         attention = self._compute_attention(q, k, v)
+         context_attention = attention[:, :context_len]
+         x_attention = attention[:, context_len:]
+
+         # Compute post-attention.
+         x = self.x_block(
+             x_attention,
+             inputs_intermediates=x_intermediates,
+             pre_attention=False,
+             training=training,
+         )
+         if self.use_context_projection:
+             context = self.context_block(
+                 context_attention,
+                 inputs_intermediates=context_intermediates,
+                 pre_attention=False,
+                 training=training,
+             )
+             return x, context
+         else:
+             return x
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "num_heads": self.num_heads,
+                 "hidden_dim": self.hidden_dim,
+                 "mlp_ratio": self.mlp_ratio,
+                 "use_context_projection": self.use_context_projection,
+             }
+         )
+         return config
+
+     def compute_output_shape(
+         self, inputs_shape, context_shape, timestep_embedding_shape
+     ):
+         if self.use_context_projection:
+             return inputs_shape, context_shape
+         else:
+             return inputs_shape
+
+
  class OutputLayer(layers.Layer):
      def __init__(self, hidden_dim, output_dim, **kwargs):
          super().__init__(**kwargs)
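The hunk above adds the joint attention machinery: each stream (latent `x` and text `context`) gets its own `DismantledBlock` for the modulated pre-attention projections, the per-stream queries/keys/values are concatenated, one attention pass is computed, and the result is split back so each stream applies its own gated MLP. Below is a minimal sketch of exercising `MMDiTBlock` in isolation; the class and its `build`/`call` signatures come from the hunk above, while the shapes and values are illustrative assumptions:

    import keras

    from keras_hub.src.models.stable_diffusion_3.mmdit import MMDiTBlock

    # Toy sizes: 16 latent tokens, 8 context tokens, hidden_dim=64 over 4 heads,
    # and one 64-wide timestep/pooled embedding per sample.
    block = MMDiTBlock(num_heads=4, hidden_dim=64, use_context_projection=True)
    block.build(
        inputs_shape=(2, 16, 64),
        context_shape=(2, 8, 64),
        timestep_embedding_shape=(2, 64),
    )

    x = keras.random.normal((2, 16, 64))       # patched latent tokens
    context = keras.random.normal((2, 8, 64))  # text/context tokens
    t_emb = keras.random.normal((2, 64))       # timestep + pooled projection

    # With use_context_projection=True both streams are returned; the final
    # block in the stack (use_context_projection=False) returns only x.
    x_out, context_out = block(x, context, t_emb)
    print(x_out.shape, context_out.shape)  # (2, 16, 64) (2, 8, 64)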
@@ -186,11 +486,11 @@ class OutputLayer(layers.Layer):
              epsilon=1e-6,
              center=False,
              scale=False,
-             dtype=self.dtype_policy,
+             dtype="float32",
              name="norm",
          )
          self.output_dense = layers.Dense(
-             output_dim,  # patch_size ** 2 * input_channels
+             output_dim,
              use_bias=True,
              dtype=self.dtype_policy,
              name="output_dense",
@@ -227,6 +527,11 @@ class OutputLayer(layers.Layer):
          )
          return config

+     def compute_output_shape(self, inputs_shape):
+         outputs_shape = list(inputs_shape)
+         outputs_shape[-1] = self.output_dim
+         return outputs_shape
+

  class Unpatch(layers.Layer):
      def __init__(self, patch_size, output_dim, **kwargs):
@@ -263,18 +568,48 @@ class Unpatch(layers.Layer):
          return [inputs_shape[0], None, None, self.output_dim]


- class MMDiT(keras.Model):
+ class MMDiT(Backbone):
+     """Multimodal Diffusion Transformer (MMDiT) model for Stable Diffusion 3.
+
+     MMDiT is introduced in [
+     Scaling Rectified Flow Transformers for High-Resolution Image Synthesis](
+     https://arxiv.org/abs/2403.03206).
+
+     Args:
+         patch_size: int. The size of each square patch in the input image.
+         hidden_dim: int. The size of the transformer hidden state at the end
+             of each transformer layer.
+         num_layers: int. The number of transformer layers.
+         num_heads: int. The number of attention heads for each transformer.
+         position_size: int. The size of the height and width for the position
+             embedding.
+         mlp_ratio: float. The ratio of the mlp hidden dim to the transformer
+         latent_shape: tuple. The shape of the latent image.
+         context_shape: tuple. The shape of the context.
+         pooled_projection_shape: tuple. The shape of the pooled projection.
+         data_format: `None` or str. If specified, either `"channels_last"` or
+             `"channels_first"`. The ordering of the dimensions in the
+             inputs. `"channels_last"` corresponds to inputs with shape
+             `(batch_size, height, width, channels)`
+             while `"channels_first"` corresponds to inputs with shape
+             `(batch_size, channels, height, width)`. It defaults to the
+             `image_data_format` value found in your Keras config file at
+             `~/.keras/keras.json`. If you never set it, then it will be
+             `"channels_last"`.
+         dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The dtype
+             to use for the model's computations and weights.
+     """
+
      def __init__(
          self,
          patch_size,
-         num_heads,
          hidden_dim,
-         depth,
+         num_layers,
+         num_heads,
          position_size,
-         output_dim,
          mlp_ratio=4.0,
          latent_shape=(64, 64, 16),
-         context_shape=(1024, 4096),
+         context_shape=(None, 4096),
          pooled_projection_shape=(2048,),
          data_format=None,
          dtype=None,
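With this hunk `MMDiT` becomes a `Backbone` and its constructor changes: `num_layers` replaces `depth`, `num_heads` moves after `hidden_dim`, the explicit `output_dim` argument is dropped (it is now derived from `latent_shape[-1]`, see the next hunk), and `context_shape` defaults to a variable-length sequence. A minimal construction sketch with deliberately tiny, illustrative sizes; real Stable Diffusion 3 checkpoints are much larger, and whether such a toy config builds end to end depends on parts of `__init__` not shown here:

    from keras_hub.src.models.stable_diffusion_3.mmdit import MMDiT

    # A 64x64x16 latent patched with patch_size=2 gives a 32x32 token grid, so
    # position_size should be at least 32 here.
    mmdit = MMDiT(
        patch_size=2,
        hidden_dim=64,
        num_layers=2,
        num_heads=4,
        position_size=32,
        mlp_ratio=4.0,
        latent_shape=(64, 64, 16),
        context_shape=(None, 4096),
        pooled_projection_shape=(2048,),
        dtype="float32",
    )
    mmdit.summary()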
@@ -287,6 +622,7 @@ class MMDiT(keras.Model):
          )
          image_height = latent_shape[0] // patch_size
          image_width = latent_shape[1] // patch_size
+         output_dim = latent_shape[-1]
          output_dim_in_final = patch_size**2 * output_dim
          data_format = standardize_data_format(data_format)
          if data_format != "channels_last":
@@ -331,11 +667,11 @@ class MMDiT(keras.Model):
                  num_heads,
                  hidden_dim,
                  mlp_ratio,
-                 use_context_projection=not (i == depth - 1),
+                 use_context_projection=not (i == num_layers - 1),
                  dtype=dtype,
                  name=f"joint_block_{i}",
              )
-             for i in range(depth)
+             for i in range(num_layers)
          ]
          self.output_layer = OutputLayer(
              hidden_dim, output_dim_in_final, dtype=dtype, name="output_layer"
@@ -391,33 +727,22 @@ class MMDiT(keras.Model):
          self.patch_size = patch_size
          self.num_heads = num_heads
          self.hidden_dim = hidden_dim
-         self.depth = depth
+         self.num_layers = num_layers
          self.position_size = position_size
-         self.output_dim = output_dim
          self.mlp_ratio = mlp_ratio
          self.latent_shape = latent_shape
          self.context_shape = context_shape
          self.pooled_projection_shape = pooled_projection_shape

-         if dtype is not None:
-             try:
-                 self.dtype_policy = keras.dtype_policies.get(dtype)
-             # Before Keras 3.2, there is no `keras.dtype_policies.get`.
-             except AttributeError:
-                 if isinstance(dtype, keras.DTypePolicy):
-                     dtype = dtype.name
-                 self.dtype_policy = keras.DTypePolicy(dtype)
-
      def get_config(self):
          config = super().get_config()
          config.update(
              {
                  "patch_size": self.patch_size,
-                 "num_heads": self.num_heads,
                  "hidden_dim": self.hidden_dim,
-                 "depth": self.depth,
+                 "num_layers": self.num_layers,
+                 "num_heads": self.num_heads,
                  "position_size": self.position_size,
-                 "output_dim": self.output_dim,
                  "mlp_ratio": self.mlp_ratio,
                  "latent_shape": self.latent_shape,
                  "context_shape": self.context_shape,