PyPI - keras-hub-nightly - Versions diffs - 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev20240915160609__py3-none-any.whl - Mend

keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev20240915160609__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

keras_hub/src/models/stable_diffusion_v3/clip_tokenizer.py ADDED Viewed

@@ -0,0 +1,167 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras_hub.src.tokenizers.byte_pair_tokenizer import BytePairTokenizer
+from keras_hub.src.tokenizers.byte_pair_tokenizer import convert_to_ragged_batch
+from keras_hub.src.tokenizers.byte_pair_tokenizer import split_strings_for_bpe
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+class CLIPTokenizer(BytePairTokenizer):
+    def __init__(self, vocabulary=None, merges=None, **kwargs):
+        self.start_token = "<|startoftext|>"
+        self.end_token = "<|endoftext|>"
+        super().__init__(
+            vocabulary=vocabulary,
+            merges=merges,
+            unsplittable_tokens=[self.start_token, self.end_token],
+            **kwargs,
+        )
+    def set_vocabulary_and_merges(self, vocabulary, merges):
+        super().set_vocabulary_and_merges(vocabulary, merges)
+        if vocabulary is not None:
+            # Check for necessary special tokens.
+            if self.end_token not in self.get_vocabulary():
+                raise ValueError(
+                    f"Cannot find token `'{self.end_token}'` in the provided "
+                    f"`vocabulary`. Please provide `'{self.end_token}'` in "
+                    "your `vocabulary` or use a pretrained `vocabulary` name."
+                )
+            self.start_token_id = self.token_to_id(self.start_token)
+            self.end_token_id = self.token_to_id(self.end_token)
+            self.pad_token_id = 0
+        else:
+            self.end_token_id = None
+            self.start_token_id = None
+            self.pad_token_id = None
+    def _bpe_merge_and_update_cache(self, tokens):
+        """Process unseen tokens and add to cache."""
+        words = self._transform_bytes(tokens)
+        # In StableDiffusionV3, we need to add `</w>` to the last word.
+        words = tf.strings.reduce_join(words, axis=1, separator=" ")
+        words = tf.strings.join([words, "</w>"])
+        words = tf.strings.split(words, sep=" ")
+        tokenized_words = self._bpe_merge(words)
+        # For each word, join all its token by a whitespace,
+        # e.g., ["dragon", "fly"] => "dragon fly" for hash purpose.
+        tokenized_words = tf.strings.reduce_join(
+            tokenized_words, axis=1, separator=" "
+        )
+        self.cache.insert(tokens, tokenized_words)
+    def tokenize(self, inputs):
+        self._check_vocabulary()
+        if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
+            inputs = tf.convert_to_tensor(inputs)
+        if self.add_prefix_space:
+            inputs = tf.strings.join([" ", inputs])
+        scalar_input = inputs.shape.rank == 0
+        if scalar_input:
+            inputs = tf.expand_dims(inputs, 0)
+        raw_tokens = split_strings_for_bpe(inputs, self.unsplittable_tokens)
+        # Strip and remove empty tokens.
+        raw_tokens = tf.strings.strip(raw_tokens)
+        raw_tokens = tf.ragged.boolean_mask(raw_tokens, raw_tokens != "")
+        token_row_splits = raw_tokens.row_splits
+        flat_tokens = raw_tokens.flat_values
+        # Check cache.
+        cache_lookup = self.cache.lookup(flat_tokens)
+        cache_mask = cache_lookup == ""
+        has_unseen_words = tf.math.reduce_any(
+            (cache_lookup == "") & (flat_tokens != "")
+        )
+        def process_unseen_tokens():
+            unseen_tokens = tf.boolean_mask(flat_tokens, cache_mask)
+            self._bpe_merge_and_update_cache(unseen_tokens)
+            return self.cache.lookup(flat_tokens)
+        # If `has_unseen_words == True`, it means not all tokens are in cache,
+        # we will process the unseen tokens. Otherwise return the cache lookup.
+        tokenized_words = tf.cond(
+            has_unseen_words,
+            process_unseen_tokens,
+            lambda: cache_lookup,
+        )
+        tokens = tf.strings.split(tokenized_words, sep=" ")
+        if self.compute_dtype != tf.string:
+            # Encode merged tokens.
+            tokens = self.token_to_id_map.lookup(tokens)
+        # Unflatten to match input.
+        tokens = tf.RaggedTensor.from_row_splits(
+            tokens.flat_values,
+            tf.gather(tokens.row_splits, token_row_splits),
+        )
+        # Convert to a dense output if `sequence_length` is set.
+        if self.sequence_length:
+            output_shape = tokens.shape.as_list()
+            output_shape[-1] = self.sequence_length
+            tokens = tokens.to_tensor(shape=output_shape)
+        # Convert to a dense output if input in scalar
+        if scalar_input:
+            tokens = tf.squeeze(tokens, 0)
+            tf.ensure_shape(tokens, shape=[self.sequence_length])
+        return tokens
+    def detokenize(self, inputs):
+        self._check_vocabulary()
+        inputs, unbatched, _ = convert_to_ragged_batch(inputs)
+        inputs = tf.cast(inputs, self.dtype)
+        unicode_text = tf.strings.reduce_join(
+            self.id_to_token_map.lookup(inputs), axis=-1
+        )
+        # When detokenizing, we need to remove </w> and extra whitespace.
+        unicode_text = tf.strings.regex_replace(unicode_text, r"</w>", " ")
+        unicode_text = tf.strings.strip(unicode_text)
+        split_unicode_text = tf.strings.unicode_split(unicode_text, "UTF-8")
+        outputs = tf.strings.reduce_join(
+            self.unicode2byte.lookup(split_unicode_text), axis=-1
+        )
+        if unbatched:
+            outputs = tf.squeeze(outputs, 0)
+        return outputs
+    def get_config(self):
+        config = super().get_config()
+        # In the constructor, we pass the list of special tokens to the
+        # `unsplittable_tokens` arg of the superclass' constructor. Hence, we
+        # delete it from the config here.
+        del config["unsplittable_tokens"]
+        return config

keras_hub/src/models/stable_diffusion_v3/mmdit.py ADDED Viewed

@@ -0,0 +1,427 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import keras
+from keras import layers
+from keras import models
+from keras import ops
+from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding
+from keras_hub.src.models.stable_diffusion_v3.mmdit_block import MMDiTBlock
+from keras_hub.src.utils.keras_utils import standardize_data_format
+class PatchEmbedding(layers.Layer):
+    def __init__(self, patch_size, hidden_dim, data_format=None, **kwargs):
+        super().__init__(**kwargs)
+        self.patch_size = int(patch_size)
+        self.hidden_dim = int(hidden_dim)
+        data_format = standardize_data_format(data_format)
+        self.patch_embedding = layers.Conv2D(
+            hidden_dim,
+            kernel_size=patch_size,
+            strides=patch_size,
+            data_format=data_format,
+            dtype=self.dtype_policy,
+            name="patch_embedding",
+        )
+    def build(self, input_shape):
+        self.patch_embedding.build(input_shape)
+    def call(self, inputs):
+        x = self.patch_embedding(inputs)
+        x_shape = ops.shape(x)
+        x = ops.reshape(x, (x_shape[0], x_shape[1] * x_shape[2], x_shape[3]))
+        return x
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "patch_size": self.patch_size,
+                "hidden_dim": self.hidden_dim,
+            }
+        )
+        return config
+class AdjustablePositionEmbedding(PositionEmbedding):
+    def __init__(
+        self,
+        height,
+        width,
+        initializer="glorot_uniform",
+        **kwargs,
+    ):
+        height = int(height)
+        width = int(width)
+        sequence_length = height * width
+        super().__init__(sequence_length, initializer, **kwargs)
+        self.height = height
+        self.width = width
+    def call(self, inputs, height=None, width=None):
+        height = height or self.height
+        width = width or self.width
+        shape = ops.shape(inputs)
+        feature_length = shape[-1]
+        top = ops.floor_divide(self.height - height, 2)
+        left = ops.floor_divide(self.width - width, 2)
+        position_embedding = ops.convert_to_tensor(self.position_embeddings)
+        position_embedding = ops.reshape(
+            position_embedding, (self.height, self.width, feature_length)
+        )
+        position_embedding = ops.slice(
+            position_embedding,
+            (top, left, 0),
+            (height, width, feature_length),
+        )
+        position_embedding = ops.reshape(
+            position_embedding, (height * width, feature_length)
+        )
+        position_embedding = ops.expand_dims(position_embedding, axis=0)
+        return position_embedding
+    def compute_output_shape(self, input_shape):
+        return input_shape
+class TimestepEmbedding(layers.Layer):
+    def __init__(
+        self, embedding_dim, frequency_dim=256, max_period=10000, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.embedding_dim = int(embedding_dim)
+        self.frequency_dim = int(frequency_dim)
+        self.max_period = float(max_period)
+        self.half_frequency_dim = self.frequency_dim // 2
+        self.mlp = models.Sequential(
+            [
+                layers.Dense(
+                    embedding_dim, activation="silu", dtype=self.dtype_policy
+                ),
+                layers.Dense(
+                    embedding_dim, activation=None, dtype=self.dtype_policy
+                ),
+            ],
+            name="mlp",
+        )
+    def build(self, inputs_shape):
+        embedding_shape = list(inputs_shape)[1:]
+        embedding_shape.append(self.frequency_dim)
+        self.mlp.build(embedding_shape)
+    def _create_timestep_embedding(self, inputs):
+        compute_dtype = keras.backend.result_type(self.compute_dtype, "float32")
+        x = ops.cast(inputs, compute_dtype)
+        freqs = ops.exp(
+            ops.divide(
+                ops.multiply(
+                    -math.log(self.max_period),
+                    ops.arange(0, self.half_frequency_dim, dtype="float32"),
+                ),
+                self.half_frequency_dim,
+            )
+        )
+        freqs = ops.cast(freqs, compute_dtype)
+        x = ops.multiply(x, ops.expand_dims(freqs, axis=0))
+        embedding = ops.concatenate([ops.cos(x), ops.sin(x)], axis=-1)
+        if self.frequency_dim % 2 != 0:
+            embedding = ops.pad(embedding, [[0, 0], [0, 1]])
+        return ops.cast(embedding, self.compute_dtype)
+    def call(self, inputs, training=None):
+        timestep_embedding = self._create_timestep_embedding(inputs)
+        return self.mlp(timestep_embedding, training=training)
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "embedding_dim": self.embedding_dim,
+                "max_period": self.max_period,
+            }
+        )
+        return config
+    def compute_output_shape(self, inputs_shape):
+        output_shape = list(inputs_shape)[1:]
+        output_shape.append(self.embedding_dim)
+        return output_shape
+class OutputLayer(layers.Layer):
+    def __init__(self, hidden_dim, output_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_dim = hidden_dim
+        self.output_dim = output_dim
+        num_modulation = 2
+        self.adaptive_norm_modulation = models.Sequential(
+            [
+                layers.Activation("silu", dtype=self.dtype_policy),
+                layers.Dense(
+                    num_modulation * hidden_dim, dtype=self.dtype_policy
+                ),
+            ],
+            name="adaptive_norm_modulation",
+        )
+        self.norm = layers.LayerNormalization(
+            epsilon=1e-6,
+            center=False,
+            scale=False,
+            dtype=self.dtype_policy,
+            name="norm",
+        )
+        self.output_dense = layers.Dense(
+            output_dim,  # patch_size ** 2 * input_channels
+            use_bias=True,
+            dtype=self.dtype_policy,
+            name="output_dense",
+        )
+    def build(self, inputs_shape, timestep_embedding_shape):
+        self.adaptive_norm_modulation.build(timestep_embedding_shape)
+        self.norm.build(inputs_shape)
+        self.output_dense.build(inputs_shape)
+    def _modulate(self, inputs, shift, scale):
+        shift = ops.expand_dims(shift, axis=1)
+        scale = ops.expand_dims(scale, axis=1)
+        return ops.add(ops.multiply(inputs, ops.add(scale, 1.0)), shift)
+    def call(self, inputs, timestep_embedding, training=None):
+        x = inputs
+        modulation = self.adaptive_norm_modulation(
+            timestep_embedding, training=training
+        )
+        modulation = ops.reshape(modulation, (-1, 2, self.hidden_dim))
+        shift, scale = ops.unstack(modulation, 2, axis=1)
+        x = self._modulate(self.norm(x), shift, scale)
+        x = self.output_dense(x, training=training)
+        return x
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "output_dim": self.output_dim,
+            }
+        )
+        return config
+class Unpatch(layers.Layer):
+    def __init__(self, patch_size, output_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.patch_size = int(patch_size)
+        self.output_dim = int(output_dim)
+    def call(self, inputs, height, width):
+        patch_size = self.patch_size
+        output_dim = self.output_dim
+        x = ops.reshape(
+            inputs,
+            (-1, height, width, patch_size, patch_size, output_dim),
+        )
+        # (b, h, w, p1, p2, o) -> (b, h, p1, w, p2, o)
+        x = ops.transpose(x, (0, 1, 3, 2, 4, 5))
+        return ops.reshape(
+            x,
+            (-1, height * patch_size, width * patch_size, output_dim),
+        )
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "patch_size": self.patch_size,
+                "output_dim": self.output_dim,
+            }
+        )
+        return config
+    def compute_output_shape(self, inputs_shape):
+        inputs_shape = list(inputs_shape)
+        return [inputs_shape[0], None, None, self.output_dim]
+class MMDiT(keras.Model):
+    """Functional Keras model built from a stack of `MMDiTBlock` layers.
+    The model takes a dict with the inputs `"latent"`, `"context"`,
+    `"pooled_projection"` and `"timestep"` and returns the output of the
+    `Unpatch` layer.
+    Args:
+        patch_size: Patch size used by `PatchEmbedding` and `Unpatch`; the
+            first two entries of `latent_shape` are floor-divided by it.
+        num_heads: Forwarded to every `MMDiTBlock`.
+        hidden_dim: Width of the embeddings and of every `MMDiTBlock`.
+        depth: Number of `MMDiTBlock` layers.
+        position_size: Height and width of the position embedding grid.
+        output_dim: Channels of the output; the final dense layer projects to
+            `patch_size**2 * output_dim`.
+        mlp_ratio: Forwarded to every `MMDiTBlock`.
+        latent_shape: Shape of the `"latent"` input without the batch axis.
+            Must not contain `None`.
+        context_shape: Shape of the `"context"` input without the batch axis.
+        pooled_projection_shape: Shape of the `"pooled_projection"` input
+            without the batch axis.
+        data_format: Passed through `standardize_data_format`; only
+            `"channels_last"` is accepted.
+        dtype: Dtype (policy) handed to every sublayer.
+        **kwargs: Forwarded to `keras.Model`.
+    Raises:
+        ValueError: If `latent_shape` contains `None`.
+        NotImplementedError: If the resolved `data_format` is not
+            `"channels_last"`.
+    """
+    def __init__(
+        self,
+        patch_size,
+        num_heads,
+        hidden_dim,
+        depth,
+        position_size,
+        output_dim,
+        mlp_ratio=4.0,
+        latent_shape=(64, 64, 16),
+        context_shape=(1024, 4096),
+        pooled_projection_shape=(2048,),
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        if None in latent_shape:
+            raise ValueError(
+                "`latent_shape` must be fully specified. "
+                f"Received: latent_shape={latent_shape}"
+            )
+        # Size of the patch grid, used to crop the position embedding and to
+        # reassemble the output image.
+        image_height = latent_shape[0] // patch_size
+        image_width = latent_shape[1] // patch_size
+        output_dim_in_final = patch_size**2 * output_dim
+        data_format = standardize_data_format(data_format)
+        if data_format != "channels_last":
+            raise NotImplementedError(
+                "Currently only 'channels_last' is supported."
+            )
+        # === Layers ===
+        self.patch_embedding = PatchEmbedding(
+            patch_size,
+            hidden_dim,
+            data_format=data_format,
+            dtype=dtype,
+            name="patch_embedding",
+        )
+        self.position_embedding_add = layers.Add(
+            dtype=dtype, name="position_embedding_add"
+        )
+        self.position_embedding = AdjustablePositionEmbedding(
+            position_size, position_size, dtype=dtype, name="position_embedding"
+        )
+        self.context_embedding = layers.Dense(
+            hidden_dim,
+            dtype=dtype,
+            name="context_embedding",
+        )
+        self.vector_embedding = models.Sequential(
+            [
+                layers.Dense(hidden_dim, activation="silu", dtype=dtype),
+                layers.Dense(hidden_dim, activation=None, dtype=dtype),
+            ],
+            name="vector_embedding",
+        )
+        self.vector_embedding_add = layers.Add(
+            dtype=dtype, name="vector_embedding_add"
+        )
+        self.timestep_embedding = TimestepEmbedding(
+            hidden_dim, dtype=dtype, name="timestep_embedding"
+        )
+        # Every block except the last one is created with
+        # `use_context_projection=True`.
+        self.joint_blocks = [
+            MMDiTBlock(
+                num_heads,
+                hidden_dim,
+                mlp_ratio,
+                use_context_projection=not (i == depth - 1),
+                dtype=dtype,
+                name=f"joint_block_{i}",
+            )
+            for i in range(depth)
+        ]
+        self.output_layer = OutputLayer(
+            hidden_dim, output_dim_in_final, dtype=dtype, name="output_layer"
+        )
+        self.unpatch = Unpatch(
+            patch_size, output_dim, dtype=dtype, name="unpatch"
+        )
+        # === Functional Model ===
+        latent_inputs = layers.Input(shape=latent_shape, name="latent")
+        context_inputs = layers.Input(shape=context_shape, name="context")
+        pooled_projection_inputs = layers.Input(
+            shape=pooled_projection_shape, name="pooled_projection"
+        )
+        timestep_inputs = layers.Input(shape=(1,), name="timestep")
+        # Embeddings.
+        x = self.patch_embedding(latent_inputs)
+        position_embedding = self.position_embedding(
+            x, height=image_height, width=image_width
+        )
+        x = self.position_embedding_add([x, position_embedding])
+        context = self.context_embedding(context_inputs)
+        pooled_projection = self.vector_embedding(pooled_projection_inputs)
+        timestep_embedding = self.timestep_embedding(timestep_inputs)
+        # The conditioning vector is the sum of the timestep embedding and
+        # the embedded pooled projection.
+        timestep_embedding = self.vector_embedding_add(
+            [timestep_embedding, pooled_projection]
+        )
+        # Blocks.
+        for block in self.joint_blocks:
+            # Blocks with a context projection return an updated context as
+            # well; the others return only `x`.
+            if block.use_context_projection:
+                x, context = block(x, context, timestep_embedding)
+            else:
+                x = block(x, context, timestep_embedding)
+        # Output layer.
+        x = self.output_layer(x, timestep_embedding)
+        outputs = self.unpatch(x, height=image_height, width=image_width)
+        super().__init__(
+            inputs={
+                "latent": latent_inputs,
+                "context": context_inputs,
+                "pooled_projection": pooled_projection_inputs,
+                "timestep": timestep_inputs,
+            },
+            outputs=outputs,
+            **kwargs,
+        )
+        # === Config ===
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.depth = depth
+        self.position_size = position_size
+        self.output_dim = output_dim
+        self.mlp_ratio = mlp_ratio
+        self.latent_shape = latent_shape
+        self.context_shape = context_shape
+        self.pooled_projection_shape = pooled_projection_shape
+        # `dtype` is not passed to `super().__init__`, so the policy is set
+        # on the model explicitly here.
+        if dtype is not None:
+            try:
+                self.dtype_policy = keras.dtype_policies.get(dtype)
+            # Before Keras 3.2, there is no `keras.dtype_policies.get`.
+            except AttributeError:
+                if isinstance(dtype, keras.DTypePolicy):
+                    dtype = dtype.name
+                self.dtype_policy = keras.DTypePolicy(dtype)
+    def get_config(self):
+        """Return the parent config extended with the constructor arguments.
+        `data_format` and `dtype` are not added by this method.
+        """
+        config = super().get_config()
+        config.update(
+            {
+                "patch_size": self.patch_size,
+                "num_heads": self.num_heads,
+                "hidden_dim": self.hidden_dim,
+                "depth": self.depth,
+                "position_size": self.position_size,
+                "output_dim": self.output_dim,
+                "mlp_ratio": self.mlp_ratio,
+                "latent_shape": self.latent_shape,
+                "context_shape": self.context_shape,
+                "pooled_projection_shape": self.pooled_projection_shape,
+            }
+        )
+        return config

keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev20240915160609__py3-none-any.whl

keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev20240915160609__py3-none-any.whl