keras-hub 0.21.1.dev0-py3-none-any.whl → 0.22.0.dev0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. keras_hub/layers/__init__.py +9 -0
  2. keras_hub/models/__init__.py +47 -0
  3. keras_hub/src/layers/modeling/transformer_encoder.py +6 -3
  4. keras_hub/src/layers/preprocessing/multi_segment_packer.py +17 -3
  5. keras_hub/src/layers/preprocessing/start_end_packer.py +24 -6
  6. keras_hub/src/models/backbone.py +13 -10
  7. keras_hub/src/models/clip/clip_backbone.py +3 -102
  8. keras_hub/src/models/clip/clip_layers.py +295 -0
  9. keras_hub/src/models/clip/clip_preprocessor.py +57 -48
  10. keras_hub/src/models/clip/clip_text_encoder.py +2 -2
  11. keras_hub/src/models/clip/clip_vision_encoder.py +3 -3
  12. keras_hub/src/models/deit/__init__.py +5 -0
  13. keras_hub/src/models/deit/deit_backbone.py +154 -0
  14. keras_hub/src/models/deit/deit_image_classifier.py +171 -0
  15. keras_hub/src/models/deit/deit_image_classifier_preprocessor.py +12 -0
  16. keras_hub/src/models/deit/deit_image_converter.py +8 -0
  17. keras_hub/src/models/deit/deit_layers.py +519 -0
  18. keras_hub/src/models/deit/deit_presets.py +49 -0
  19. keras_hub/src/models/dinov2/__init__.py +5 -0
  20. keras_hub/src/models/dinov2/dinov2_backbone.py +228 -0
  21. keras_hub/src/models/dinov2/dinov2_image_converter.py +8 -0
  22. keras_hub/src/models/dinov2/dinov2_layers.py +886 -0
  23. keras_hub/src/models/dinov2/dinov2_presets.py +89 -0
  24. keras_hub/src/models/esm/__init__.py +5 -0
  25. keras_hub/src/models/esm/esm_attention.py +95 -0
  26. keras_hub/src/models/esm/esm_backbone.py +229 -0
  27. keras_hub/src/models/esm/esm_classifier.py +184 -0
  28. keras_hub/src/models/esm/esm_classifier_preprocessor.py +135 -0
  29. keras_hub/src/models/esm/esm_encoder.py +134 -0
  30. keras_hub/src/models/esm/esm_masked_plm.py +117 -0
  31. keras_hub/src/models/esm/esm_masked_plm_preprocessor.py +143 -0
  32. keras_hub/src/models/esm/esm_presets.py +53 -0
  33. keras_hub/src/models/esm/esm_tokenizer.py +82 -0
  34. keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +6 -2
  35. keras_hub/src/models/gemma/gemma_attention.py +1 -1
  36. keras_hub/src/models/gemma3/gemma3_backbone.py +2 -2
  37. keras_hub/src/models/gemma3/gemma3_interleave_embeddings.py +1 -1
  38. keras_hub/src/models/hgnetv2/__init__.py +5 -0
  39. keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +193 -0
  40. keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +148 -0
  41. keras_hub/src/models/hgnetv2/hgnetv2_image_classifier.py +216 -0
  42. keras_hub/src/models/hgnetv2/hgnetv2_image_classifier_preprocessor.py +14 -0
  43. keras_hub/src/models/hgnetv2/hgnetv2_image_converter.py +8 -0
  44. keras_hub/src/models/hgnetv2/hgnetv2_layers.py +918 -0
  45. keras_hub/src/models/hgnetv2/hgnetv2_presets.py +58 -0
  46. keras_hub/src/models/llama3/llama3_presets.py +3 -3
  47. keras_hub/src/models/mistral/mistral_presets.py +17 -1
  48. keras_hub/src/models/mixtral/mixtral_presets.py +2 -2
  49. keras_hub/src/models/mobilenet/mobilenet_presets.py +4 -4
  50. keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +2 -2
  51. keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +2 -2
  52. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +17 -17
  53. keras_hub/src/models/qwen3/__init__.py +5 -0
  54. keras_hub/src/models/qwen3/qwen3_attention.py +369 -0
  55. keras_hub/src/models/qwen3/qwen3_backbone.py +191 -0
  56. keras_hub/src/models/qwen3/qwen3_causal_lm.py +390 -0
  57. keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor.py +10 -0
  58. keras_hub/src/models/qwen3/qwen3_decoder.py +309 -0
  59. keras_hub/src/models/qwen3/qwen3_layernorm.py +38 -0
  60. keras_hub/src/models/qwen3/qwen3_presets.py +73 -0
  61. keras_hub/src/models/qwen3/qwen3_tokenizer.py +48 -0
  62. keras_hub/src/models/qwen_moe/qwen_moe_attention.py +1 -0
  63. keras_hub/src/models/qwen_moe/qwen_moe_presets.py +2 -2
  64. keras_hub/src/models/roformer_v2/roformer_v2_attention.py +0 -2
  65. keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +16 -7
  66. keras_hub/src/models/stable_diffusion_3/mmdit.py +61 -4
  67. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +31 -32
  68. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +1 -0
  69. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +1 -0
  70. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +1 -0
  71. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +6 -2
  72. keras_hub/src/models/vit/vit_backbone.py +31 -11
  73. keras_hub/src/models/vit/vit_image_converter.py +0 -70
  74. keras_hub/src/models/vit/vit_layers.py +33 -18
  75. keras_hub/src/models/vit/vit_presets.py +11 -11
  76. keras_hub/src/utils/keras_utils.py +17 -0
  77. keras_hub/src/utils/preset_utils.py +19 -4
  78. keras_hub/src/utils/tensor_utils.py +14 -0
  79. keras_hub/src/utils/transformers/convert_deit.py +155 -0
  80. keras_hub/src/utils/transformers/convert_dinov2.py +180 -0
  81. keras_hub/src/utils/transformers/convert_esm.py +159 -0
  82. keras_hub/src/utils/transformers/convert_llama3.py +6 -0
  83. keras_hub/src/utils/transformers/convert_qwen3.py +145 -0
  84. keras_hub/src/utils/transformers/export/gemma.py +89 -0
  85. keras_hub/src/utils/transformers/export/hf_exporter.py +98 -0
  86. keras_hub/src/utils/transformers/preset_loader.py +14 -2
  87. keras_hub/src/version.py +1 -1
  88. keras_hub/tokenizers/__init__.py +1 -0
  89. {keras_hub-0.21.1.dev0.dist-info → keras_hub-0.22.0.dev0.dist-info}/METADATA +4 -4
  90. {keras_hub-0.21.1.dev0.dist-info → keras_hub-0.22.0.dev0.dist-info}/RECORD +92 -48
  91. keras_hub/src/models/clip/clip_encoder_block.py +0 -111
  92. keras_hub/src/models/clip/clip_vision_embedding.py +0 -101
  93. {keras_hub-0.21.1.dev0.dist-info → keras_hub-0.22.0.dev0.dist-info}/WHEEL +0 -0
  94. {keras_hub-0.21.1.dev0.dist-info → keras_hub-0.22.0.dev0.dist-info}/top_level.txt +0 -0
keras_hub/src/models/clip/clip_layers.py
@@ -0,0 +1,295 @@
+import math
+
+from keras import layers
+from keras import ops
+
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+def quick_gelu(x):
+    return x * ops.sigmoid(1.702 * x)
+
+
+class CLIPVisionEmbedding(layers.Layer):
+    def __init__(
+        self,
+        hidden_dim,
+        patch_size,
+        image_size,
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        super().__init__(dtype=dtype, **kwargs)
+        self.hidden_dim = int(hidden_dim)
+        self.patch_size = int(patch_size)
+        self.image_size = int(image_size)
+        data_format = standardize_data_format(data_format)
+        self.data_format = data_format
+        num_patches = (image_size // patch_size) ** 2
+        self.num_positions = num_patches + 1
+
+        self.patch_embedding = layers.Conv2D(
+            hidden_dim,
+            kernel_size=patch_size,
+            strides=patch_size,
+            data_format=data_format,
+            use_bias=False,
+            dtype=dtype,
+            name="patch_embedding",
+        )
+        self.position_embedding = layers.Embedding(
+            num_patches + 1, hidden_dim, dtype=dtype, name="position_embedding"
+        )
+
+    def build(self, input_shape):
+        self.class_embedding = self.add_weight(
+            shape=(self.hidden_dim,),
+            initializer="random_normal",
+            dtype=self.variable_dtype,
+            name="class_embedding",
+        )
+        self.position_ids = self.add_weight(
+            shape=(1, self.num_positions),
+            initializer="zeros",
+            # Let the backend determine the int dtype. For example, tf
+            # requires int64 for correct device placement, whereas jax and torch
+            # don't.
+            dtype=int,
+            trainable=False,
+            name="position_ids",
+        )
+        self.patch_embedding.build(input_shape)
+        self.position_embedding.build(self.position_ids.shape)
+
+    def call(self, inputs, training=None):
+        x = inputs
+        batch_size = ops.shape(x)[0]
+        patch_embeddings = self.patch_embedding(x, training=training)
+        if self.data_format == "channels_last":
+            patch_embeddings = ops.reshape(
+                patch_embeddings, (batch_size, -1, self.hidden_dim)
+            )
+        else:
+            patch_embeddings = ops.reshape(
+                patch_embeddings, (batch_size, self.hidden_dim, -1)
+            )
+            patch_embeddings = ops.transpose(patch_embeddings, (0, 2, 1))
+        class_embeddings = ops.expand_dims(self.class_embedding, axis=(0, 1))
+        class_embeddings = ops.tile(class_embeddings, (batch_size, 1, 1))
+        position_embeddings = self.position_embedding(self.position_ids)
+        embeddings = ops.concatenate(
+            [class_embeddings, patch_embeddings], axis=1
+        )
+        return ops.add(embeddings, position_embeddings)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "patch_size": self.patch_size,
+                "image_size": self.image_size,
+            }
+        )
+        return config
+
+    def compute_output_shape(self, input_shape):
+        output_shape = [input_shape[0], None, self.hidden_dim]
+        if self.data_format == "channels_last":
+            if input_shape[1] is not None and input_shape[2] is not None:
+                patch_num = input_shape[1] // self.patch_size
+                output_shape[1] = patch_num**2 + 1
+        else:
+            if input_shape[2] is not None and input_shape[3] is not None:
+                patch_num = input_shape[2] // self.patch_size
+                output_shape[1] = patch_num**2 + 1
+        return output_shape
+
+
+class CLIPEncoderLayer(layers.Layer):
+    def __init__(
+        self,
+        hidden_dim,
+        num_heads,
+        intermediate_dim,
+        intermediate_activation="quick_gelu",
+        use_causal_mask=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if hidden_dim % num_heads != 0:
+            raise ValueError(
+                "`hidden_dim` must be divisible by `num_heads`. "
+                f"Received: hidden_dim={hidden_dim}, num_heads={num_heads}"
+            )
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.intermediate_dim = intermediate_dim
+        self.intermediate_activation = intermediate_activation
+        self.use_causal_mask = use_causal_mask
+
+        if intermediate_activation == "quick_gelu":
+            intermediate_activation = quick_gelu
+
+        self.layer_norm_1 = layers.LayerNormalization(
+            epsilon=1e-5, dtype=self.dtype_policy, name="layer_norm_1"
+        )
+        self.attention = layers.MultiHeadAttention(
+            num_heads,
+            hidden_dim // num_heads,
+            dtype=self.dtype_policy,
+            name="attention",
+        )
+        self.layer_norm_2 = layers.LayerNormalization(
+            epsilon=1e-5, dtype=self.dtype_policy, name="layer_norm_2"
+        )
+        self.dense_1 = layers.Dense(
+            self.intermediate_dim, dtype=self.dtype_policy, name="dense_1"
+        )
+        self.activation = layers.Activation(
+            intermediate_activation, dtype=self.dtype_policy, name="activation"
+        )
+        self.dense_2 = layers.Dense(
+            self.hidden_dim, dtype=self.dtype_policy, name="dense_2"
+        )
+
+    def build(self, input_shape):
+        self.layer_norm_1.build(input_shape)
+        self.attention.build(input_shape, input_shape, input_shape)
+        self.layer_norm_2.build(input_shape)
+        self.dense_1.build(input_shape)
+        input_shape = self.dense_1.compute_output_shape(input_shape)
+        self.dense_2.build(input_shape)
+
+    def compute_output_shape(self, inputs_shape):
+        outputs_shape = list(inputs_shape)
+        outputs_shape[-1] = self.hidden_dim
+        return outputs_shape
+
+    def call(self, x, training=None):
+        residual = x
+        x = self.layer_norm_1(x)
+        x = self.attention(
+            x, x, x, training=training, use_causal_mask=self.use_causal_mask
+        )
+        x = ops.add(residual, x)
+
+        residual = x
+        x = self.dense_1(self.layer_norm_2(residual))
+        x = self.activation(x)
+        x = self.dense_2(x)
+        x = ops.add(residual, x)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "num_heads": self.num_heads,
+                "intermediate_dim": self.intermediate_dim,
+                "intermediate_activation": self.intermediate_activation,
+                "use_causal_mask": self.use_causal_mask,
+            }
+        )
+        return config
+
+
+class CLIPVisionPooler(layers.Layer):
+    """The vision pooler layer of CLIP.
+
+    `CLIPVisionPooler` extracts the first token (index `0`) from the
+    sequence of the vision embeddings as the pooled outputs.
+
+    Call arguments:
+        vision_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+    """
+
+    def call(self, vision_embeddings):
+        return vision_embeddings[:, 0, :]
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+
+class CLIPTextPooler(layers.Layer):
+    """The text pooler layer of CLIP.
+
+    `CLIPTextPooler` extracts the text embeddings at the positions of EOS tokens
+    as the pooled outputs.
+
+    Call arguments:
+        text_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+        token_ids: A tensor of shape `(batch_size, max_tokens)`, used to
+            identify the positions of EOS tokens.
+    """
+
+    def call(self, text_embeddings, token_ids):
+        # `keepdims` is not supported in `keras<=3.1`.
+        eos_index = ops.argmax(token_ids, axis=-1)
+        eos_index = ops.expand_dims(eos_index, axis=-1)
+        eos_index = ops.expand_dims(eos_index, axis=-1)
+        pooled_outputs = ops.take_along_axis(text_embeddings, eos_index, axis=1)
+        return ops.squeeze(pooled_outputs, axis=1)
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+
+class CLIPHead(layers.Layer):
+    """The head layer of CLIP.
+
+    `CLIPHead` takes `vision_embedding` and `text_embedding` as inputs to
+    compute the corresponding logits. Both embeddings are L2 normalized and used
+    to compute pairwise cosine similarity. The resulting logits are then scaled
+    by a learnable `logit_scale` parameter.
+
+    Call arguments:
+        vision_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+        text_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+    """
+
+    def build(self, input_shape):
+        self.logit_scale = self.add_weight(
+            shape=(),
+            initializer=lambda *a, **kw: math.log(1 / 0.07),
+            trainable=True,
+            dtype=self.variable_dtype,
+            name="logit_scale",
+        )
+
+    def call(self, vision_embedding, text_embedding):
+        normalized_vision_embedding = ops.sqrt(
+            ops.sum(ops.power(vision_embedding, 2), axis=-1, keepdims=True)
+        )
+        normalized_text_embedding = ops.sqrt(
+            ops.sum(ops.power(text_embedding, 2), axis=-1, keepdims=True)
+        )
+        vision_embedding = vision_embedding / normalized_vision_embedding
+        text_embedding = text_embedding / normalized_text_embedding
+        logit_scale = ops.exp(self.logit_scale)
+        text_logits = (
+            ops.matmul(
+                text_embedding,
+                ops.transpose(vision_embedding),
+            )
+            * logit_scale
+        )
+        vision_logits = ops.transpose(text_logits)
+        return vision_logits, text_logits
+
+    def compute_output_shape(
+        self, vision_embedding_shape, text_embedding_shape
+    ):
+        vision_logits_shape = (
+            vision_embedding_shape[0],
+            text_embedding_shape[0],
+        )
+        text_logits_shape = (
+            text_embedding_shape[0],
+            vision_embedding_shape[0],
+        )
+        return vision_logits_shape, text_logits_shape
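The new `CLIPHead` above turns pooled image and text embeddings into scaled cosine-similarity logits. As a quick illustration (not part of the diff), the layer can be exercised on its own; the private import path mirrors the file added above, and the batch sizes and embedding width below are arbitrary assumptions:

```python
# Minimal sketch: exercising the new CLIPHead outside of CLIPBackbone.
import numpy as np

from keras_hub.src.models.clip.clip_layers import CLIPHead

head = CLIPHead()
vision_embedding = np.random.rand(2, 512).astype("float32")  # 2 images
text_embedding = np.random.rand(3, 512).astype("float32")  # 3 captions
vision_logits, text_logits = head(vision_embedding, text_embedding)
# vision_logits: (2, 3) image-to-text similarities; text_logits: (3, 2).
```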
keras_hub/src/models/clip/clip_preprocessor.py
@@ -2,8 +2,10 @@ import keras
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+from keras_hub.src.models.clip.clip_backbone import CLIPBackbone
+from keras_hub.src.models.clip.clip_image_converter import CLIPImageConverter
 from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer
-from keras_hub.src.models.preprocessor import Preprocessor
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 try:
@@ -13,32 +15,18 @@ except ImportError:
 
 
 @keras_hub_export("keras_hub.models.CLIPPreprocessor")
-class CLIPPreprocessor(Preprocessor):
-    """CLIP preprocessing layer which tokenizes and packs inputs.
+class CLIPPreprocessor(CausalLMPreprocessor):
+    """CLIP preprocessor.
 
     This preprocessing layer will do 2 things:
 
-    - Tokenize the inputs using the `tokenizer`.
-    - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`.
-
-    This layer can be used directly with `tf.data.Dataset.map` to preprocess
-    string data in the `(x, y, sample_weight)` format used by
-    `keras.Model.fit`.
-
-    The call method of this layer accepts three arguments, `x`, `y`, and
-    `sample_weight`. `x` can be a python string or tensor representing a single
-    segment, a list of python strings representing a batch of single segments,
-    or a list of tensors representing multiple segments to be packed together.
-    `y` and `sample_weight` are both optional, can have any format, and will be
-    passed through unaltered.
-
-    `CLIPPreprocessor` forces the input to have only one segment, as CLIP is
-    mainly used for generation tasks. For tasks having multi-segment inputs
-    like "glue/mnli", please use a model designed for classification purposes
-    such as BERT or RoBERTa.
+    This preprocessing layer is meant for use with
+    `keras_hub.models.CLIPBackbone`. By default, it will take in batches of
+    strings and images, and return token ids and resized images.
 
     Args:
         tokenizer: A `keras_hub.models.CLIPTokenizer` instance.
+        image_converter: A `keras_hub.models.CLIPImageConverter` instance.
         sequence_length: The length of the packed inputs.
         add_start_token: If `True`, the preprocessor will prepend the tokenizer
             start token to each input sequence.
@@ -47,32 +35,62 @@ class CLIPPreprocessor(Preprocessor):
         to_lower: bool. Whether to lower the inputs.
 
     Call arguments:
-        x: A string, `tf.Tensor` or list of python strings.
-        y: Any label data. Will be passed through unaltered.
-        sample_weight: Any label weight data. Will be passed through unaltered.
+        x: A dict with `"prompts"` and `"images"` keys, where `"prompts"` is
+            `tf.Tensor` or list of python strings and `"images"` are the image
+            tensors.
+        y: Label data. Should always be `None` since CLIP doesn't need the
+            label to calculate the loss.
+        sample_weight: Label weights.
         sequence_length: Pass to override the configured `sequence_length` of
             the layer.
-    """
 
-    # TODO: Add example once we have a CLIP model.
+    Examples:
+    ```python
+    # Load the preprocessor from a preset.
+    preprocessor = keras_hub.models.CLIPPreprocessor.from_preset(
+        "clip_vit_base_patch16"
+    )
+
+    # Tokenize the sentence and preprocess the image.
+    preprocessor(
+        {
+            "prompts": "The quick brown fox jumped.",
+            "images": np.ones(shape=(123, 123, 3)),
+        }
+    )
+
+    # Tokenize a batch of sentences and preprocess a batch of images.
+    preprocessor(
+        {
+            "prompts": ["The quick brown fox jumped.", "The fox slept."],
+            "images": np.ones(shape=(2, 123, 123, 3)),
+        }
+    )
+    ```
+    """
 
+    backbone_cls = CLIPBackbone
     tokenizer_cls = CLIPTokenizer
+    image_converter_cls = CLIPImageConverter
 
     def __init__(
         self,
         tokenizer,
+        image_converter=None,
         sequence_length=77,
         add_start_token=True,
        add_end_token=True,
         to_lower=True,
         **kwargs,
     ):
-        super().__init__(**kwargs)
-        self.tokenizer = tokenizer
-        self.packer = None
-        self.sequence_length = sequence_length
-        self.add_start_token = add_start_token
-        self.add_end_token = add_end_token
+        super().__init__(
+            tokenizer=tokenizer,
+            sequence_length=sequence_length,
+            add_start_token=add_start_token,
+            add_end_token=add_end_token,
+            **kwargs,
+        )
+        self.image_converter = image_converter
         self.to_lower = to_lower
 
     def build(self, input_shape):
@@ -96,10 +114,14 @@ class CLIPPreprocessor(Preprocessor):
         sequence_length=None,
     ):
         sequence_length = sequence_length or self.sequence_length
+        images, prompts = x["images"], x["prompts"]
         if self.to_lower:
-            x = tf.strings.lower(x)
+            prompts = tf.strings.lower(prompts)
+        prompts = self.tokenizer(prompts)
+        if images is not None and self.image_converter:
+            images = self.image_converter(images)
         token_ids, padding_mask = self.packer(
-            self.tokenizer(x),
+            prompts,
             sequence_length=sequence_length,
             add_start_value=self.add_start_token,
             add_end_value=self.add_end_token,
@@ -107,6 +129,7 @@ class CLIPPreprocessor(Preprocessor):
         x = {
             "token_ids": token_ids,
             "padding_mask": padding_mask,
+            "images": images,
         }
         return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
 
@@ -114,21 +137,7 @@ class CLIPPreprocessor(Preprocessor):
         config = super().get_config()
         config.update(
             {
-                "sequence_length": self.sequence_length,
-                "add_start_token": self.add_start_token,
-                "add_end_token": self.add_end_token,
                 "to_lower": self.to_lower,
             }
         )
         return config
-
-    @property
-    def sequence_length(self):
-        """The padded length of model input sequences."""
-        return self._sequence_length
-
-    @sequence_length.setter
-    def sequence_length(self, value):
-        self._sequence_length = value
-        if self.packer is not None:
-            self.packer.sequence_length = value
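With the changes above, `CLIPPreprocessor` now consumes a dict of `"prompts"` and `"images"` and emits `"token_ids"`, `"padding_mask"`, and converted `"images"`. A hedged sketch of wiring it into a `tf.data` pipeline; the preset name is the one used in the docstring above, and the image size is an assumption:

```python
import numpy as np
import tensorflow as tf

import keras_hub

preprocessor = keras_hub.models.CLIPPreprocessor.from_preset(
    "clip_vit_base_patch16"
)
features = {
    "prompts": ["a photo of a cat", "a photo of a dog"],
    "images": np.ones(shape=(2, 224, 224, 3), dtype="float32"),
}
ds = tf.data.Dataset.from_tensor_slices(features).batch(2)
# Each mapped element is a dict with "token_ids", "padding_mask", and "images".
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
```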
keras_hub/src/models/clip/clip_text_encoder.py
@@ -5,7 +5,7 @@ from keras_hub.src.layers.modeling.token_and_position_embedding import (
     TokenAndPositionEmbedding,
 )
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.clip.clip_encoder_block import CLIPEncoderBlock
+from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
 
 
 @keras_hub_export("keras_hub.models.CLIPTextEncoder")
@@ -71,7 +71,7 @@ class CLIPTextEncoder(Backbone):
             name=f"{prefix}embedding",
         )
         self.encoder_layers = [
-            CLIPEncoderBlock(
+            CLIPEncoderLayer(
                 hidden_dim,
                 num_heads,
                 intermediate_dim,
keras_hub/src/models/clip/clip_vision_encoder.py
@@ -2,8 +2,8 @@ from keras import layers
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.clip.clip_encoder_block import CLIPEncoderBlock
-from keras_hub.src.models.clip.clip_vision_embedding import CLIPVisionEmbedding
+from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
+from keras_hub.src.models.clip.clip_layers import CLIPVisionEmbedding
 from keras_hub.src.utils.keras_utils import standardize_data_format
 
 
@@ -91,7 +91,7 @@ class CLIPVisionEncoder(Backbone):
             epsilon=1e-5, dtype=dtype, name=f"{prefix}pre_layer_norm"
         )
         self.encoder_layers = [
-            CLIPEncoderBlock(
+            CLIPEncoderLayer(
                 hidden_dim,
                 num_heads,
                 intermediate_dim,
keras_hub/src/models/deit/__init__.py
@@ -0,0 +1,5 @@
+from keras_hub.src.models.deit.deit_backbone import DeiTBackbone
+from keras_hub.src.models.deit.deit_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, DeiTBackbone)
keras_hub/src/models/deit/deit_backbone.py
@@ -0,0 +1,154 @@
+import keras
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.backbone import Backbone
+from keras_hub.src.models.deit.deit_layers import DeiTEmbeddings
+from keras_hub.src.models.deit.deit_layers import DeiTEncoder
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+@keras_hub_export("keras_hub.models.DeiTBackbone")
+class DeiTBackbone(Backbone):
+    """DeiT backbone.
+
+    This backbone implements the Data-efficient Image Transformer (DeiT)
+    architecture as described in [Training data-efficient image
+    transformers & distillation through attention]
+    (https://arxiv.org/abs/2012.12877).
+
+    Args:
+        image_shape: A tuple or list of 3 integers representing the shape of the
+            input image `(height, width, channels)`.
+        patch_size: tuple or int. The size of each image patch. If an int is
+            provided, it will be used for both height and width. The input image
+            will be split into patches of shape `(patch_size_h, patch_size_w)`.
+        num_layers: int. The number of transformer encoder layers.
+        num_heads: int. The number of attention heads in each Transformer
+            encoder layer.
+        hidden_dim: int. The dimensionality of the hidden representations.
+        intermediate_dim: int. The dimensionality of the intermediate MLP layer
+            in each Transformer encoder layer.
+        dropout_rate: float. The dropout rate for the Transformer encoder
+            layers.
+        attention_dropout: float. The dropout rate for the attention mechanism
+            in each Transformer encoder layer.
+        layer_norm_epsilon: float. Value used for numerical stability in layer
+            normalization.
+        use_mha_bias: bool. Whether to use bias in the multi-head attention
+            layers.
+        data_format: str. `"channels_last"` or `"channels_first"`, specifying
+            the data format for the input image. If `None`, defaults to
+            `"channels_last"`.
+        dtype: The dtype of the layer weights. Defaults to None.
+        **kwargs: Additional keyword arguments to be passed to the parent
+            `Backbone` class.
+    """
+
+    def __init__(
+        self,
+        image_shape,
+        patch_size,
+        num_layers,
+        num_heads,
+        hidden_dim,
+        intermediate_dim,
+        dropout_rate=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-6,
+        use_mha_bias=True,
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        # === Layers ===
+        data_format = standardize_data_format(data_format)
+        if isinstance(patch_size, int):
+            patch_size = (patch_size, patch_size)
+        h_axis, w_axis, channels_axis = (
+            (-3, -2, -1) if data_format == "channels_last" else (-2, -1, -3)
+        )
+        # Check that the input image is well specified.
+        if image_shape[h_axis] is None or image_shape[w_axis] is None:
+            raise ValueError(
+                f"Image shape must have defined height and width. Found `None` "
+                f"at index {h_axis} (height) or {w_axis} (width). "
+                f"Image shape: {image_shape}"
+            )
+        # Check that image dimensions are divisible by patch size.
+        if image_shape[h_axis] % patch_size[0] != 0:
+            raise ValueError(
+                f"Input height {image_shape[h_axis]} should be divisible by "
+                f"patch size {patch_size}."
+            )
+        if image_shape[w_axis] % patch_size[1] != 0:
+            raise ValueError(
+                f"Input width {image_shape[w_axis]} should be divisible by "
+                f"patch size {patch_size}."
+            )
+
+        num_channels = image_shape[channels_axis]
+
+        # === Functional Model ===
+        inputs = keras.layers.Input(shape=image_shape)
+
+        x = DeiTEmbeddings(
+            image_size=(image_shape[h_axis], image_shape[w_axis]),
+            patch_size=patch_size,
+            hidden_dim=hidden_dim,
+            num_channels=num_channels,
+            data_format=data_format,
+            dropout_rate=dropout_rate,
+            dtype=dtype,
+            name="deit_patching_and_embedding",
+        )(inputs)
+
+        output, _, _ = DeiTEncoder(
+            num_layers=num_layers,
+            num_heads=num_heads,
+            hidden_dim=hidden_dim,
+            intermediate_dim=intermediate_dim,
+            use_mha_bias=use_mha_bias,
+            dropout_rate=dropout_rate,
+            attention_dropout=attention_dropout,
+            layer_norm_epsilon=layer_norm_epsilon,
+            dtype=dtype,
+            name="deit_encoder",
+        )(x)
+
+        super().__init__(
+            inputs=inputs,
+            outputs=output,
+            dtype=dtype,
+            **kwargs,
+        )
+
+        # === Config ===
+        self.image_shape = image_shape
+        self.patch_size = patch_size
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.intermediate_dim = intermediate_dim
+        self.dropout_rate = dropout_rate
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.use_mha_bias = use_mha_bias
+        self.data_format = data_format
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "image_shape": self.image_shape,
+                "patch_size": self.patch_size,
+                "num_layers": self.num_layers,
+                "num_heads": self.num_heads,
+                "hidden_dim": self.hidden_dim,
+                "intermediate_dim": self.intermediate_dim,
+                "dropout_rate": self.dropout_rate,
+                "attention_dropout": self.attention_dropout,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+                "use_mha_bias": self.use_mha_bias,
+            }
+        )
+        return config
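For reference, a minimal sketch of instantiating the new `DeiTBackbone` directly from its constructor arguments; the hyperparameters below are illustrative ViT-Base-style values chosen for the example, not one of the published `deit_presets`:

```python
import numpy as np

import keras_hub

backbone = keras_hub.models.DeiTBackbone(
    image_shape=(224, 224, 3),
    patch_size=16,
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
    intermediate_dim=3072,
)
images = np.ones((1, 224, 224, 3), dtype="float32")
# Sequence features of shape (1, sequence_length, 768); for DeiT the sequence
# typically includes the class and distillation tokens alongside the patches.
features = backbone(images)
```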