keras-hub-nightly 0.22.0.dev202507150421__py3-none-any.whl → 0.22.0.dev202507170424__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/layers/__init__.py +3 -0
- keras_hub/models/__init__.py +3 -0
- keras_hub/src/models/clip/clip_backbone.py +3 -102
- keras_hub/src/models/clip/clip_layers.py +295 -0
- keras_hub/src/models/clip/clip_preprocessor.py +57 -48
- keras_hub/src/models/clip/clip_text_encoder.py +2 -2
- keras_hub/src/models/clip/clip_vision_encoder.py +3 -3
- keras_hub/src/models/dinov2/__init__.py +5 -0
- keras_hub/src/models/dinov2/dinov2_backbone.py +228 -0
- keras_hub/src/models/dinov2/dinov2_image_converter.py +8 -0
- keras_hub/src/models/dinov2/dinov2_layers.py +886 -0
- keras_hub/src/models/dinov2/dinov2_presets.py +4 -0
- keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +6 -2
- keras_hub/src/models/hgnetv2/__init__.py +5 -0
- keras_hub/src/models/hgnetv2/hgnetv2_presets.py +5 -5
- keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +16 -7
- keras_hub/src/models/stable_diffusion_3/mmdit.py +61 -4
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +23 -32
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +6 -2
- keras_hub/src/utils/preset_utils.py +4 -1
- keras_hub/src/utils/transformers/convert_dinov2.py +180 -0
- keras_hub/src/utils/transformers/export/gemma.py +89 -0
- keras_hub/src/utils/transformers/export/hf_exporter.py +98 -0
- keras_hub/src/utils/transformers/preset_loader.py +4 -1
- keras_hub/src/version.py +1 -1
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/RECORD +32 -25
- keras_hub/src/models/clip/clip_encoder_block.py +0 -111
- keras_hub/src/models/clip/clip_vision_embedding.py +0 -101
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/top_level.txt +0 -0
keras_hub/layers/__init__.py
CHANGED
@@ -84,6 +84,9 @@ from keras_hub.src.models.deit.deit_image_converter import (
 from keras_hub.src.models.densenet.densenet_image_converter import (
     DenseNetImageConverter as DenseNetImageConverter,
 )
+from keras_hub.src.models.dinov2.dinov2_image_converter import (
+    DINOV2ImageConverter as DINOV2ImageConverter,
+)
 from keras_hub.src.models.efficientnet.efficientnet_image_converter import (
     EfficientNetImageConverter as EfficientNetImageConverter,
 )
keras_hub/models/__init__.py
CHANGED
@@ -157,6 +157,9 @@ from keras_hub.src.models.densenet.densenet_image_classifier import (
 from keras_hub.src.models.densenet.densenet_image_classifier_preprocessor import (
     DenseNetImageClassifierPreprocessor as DenseNetImageClassifierPreprocessor,
 )
+from keras_hub.src.models.dinov2.dinov2_backbone import (
+    DINOV2Backbone as DINOV2Backbone,
+)
 from keras_hub.src.models.distil_bert.distil_bert_backbone import (
     DistilBertBackbone as DistilBertBackbone,
 )
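The two `__init__.py` hunks above re-export the new DINOv2 symbols into the public `keras_hub.layers` and `keras_hub.models` namespaces. A minimal sketch of what that exposes (only the re-exported names are taken from this diff; constructor arguments are not shown because they are not part of it):

```python
import keras_hub

# Both symbols become importable from the public namespaces in this release.
print(keras_hub.models.DINOV2Backbone)
print(keras_hub.layers.DINOV2ImageConverter)
```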
keras_hub/src/models/clip/clip_backbone.py
CHANGED
@@ -1,109 +1,10 @@
-import math
-
 from keras import layers
-from keras import ops
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.backbone import Backbone
-
-
-class CLIPVisionPooler(layers.Layer):
-    """The vision pooler layer of CLIP.
-
-    `CLIPVisionPooler` will extracts the first token (index `0`) from the
-    sequence of the vision embeddings as the pooled outputs.
-
-    Call arguments:
-        vision_embeddings: A tensor of shape
-            `(batch_size, sequence_length, hidden_dim)`.
-    """
-
-    def call(self, vision_embeddings):
-        return vision_embeddings[:, 0, :]
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1])
-
-
-class CLIPTextPooler(layers.Layer):
-    """The text pooler layer of CLIP.
-
-    `CLIPTextPooler` extracts the text embeddings at the positions of EOS tokens
-    as the pooled outputs.
-
-    Call arguments:
-        text_embeddings: A tensor of shape
-            `(batch_size, sequence_length, hidden_dim)`.
-        token_ids: A tensor of shape `(batch_size, max_tokens)`, used to
-            identify the positions of EOS tokens.
-    """
-
-    def call(self, text_embeddings, token_ids):
-        # `keepdims` is not supported in `keras<=3.1`.
-        eos_index = ops.argmax(token_ids, axis=-1)
-        eos_index = ops.expand_dims(eos_index, axis=-1)
-        eos_index = ops.expand_dims(eos_index, axis=-1)
-        pooled_outputs = ops.take_along_axis(text_embeddings, eos_index, axis=1)
-        return ops.squeeze(pooled_outputs, axis=1)
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1])
-
-
-class CLIPHead(layers.Layer):
-    """The head layer of CLIP.
-
-    `CLIPHead` takes `vision_embedding` and `text_embedding` as inputs to
-    compute the corresponding logits. Both embeddings are L2 normalized and used
-    to compute pairwise cosine similarity. The resulting logits are then scaled
-    by a learnable `logit_scale` parameter.
-
-    Call arguments:
-        vision_embedding: A tensor of shape `(batch_size, hidden_dim)`.
-        text_embedding: A tensor of shape `(batch_size, hidden_dim)`.
-    """
-
-    def build(self, input_shape):
-        self.logit_scale = self.add_weight(
-            shape=(),
-            initializer=lambda *a, **kw: math.log(1 / 0.07),
-            trainable=True,
-            dtype=self.variable_dtype,
-            name="logit_scale",
-        )
-
-    def call(self, vision_embedding, text_embedding):
-        normalized_vision_embedding = ops.sqrt(
-            ops.sum(ops.power(vision_embedding, 2), axis=-1, keepdims=True)
-        )
-        normalized_text_embedding = ops.sqrt(
-            ops.sum(ops.power(text_embedding, 2), axis=-1, keepdims=True)
-        )
-        vision_embedding = vision_embedding / normalized_vision_embedding
-        text_embedding = text_embedding / normalized_text_embedding
-        logit_scale = ops.exp(self.logit_scale)
-        text_logits = (
-            ops.matmul(
-                text_embedding,
-                ops.transpose(vision_embedding),
-            )
-            * logit_scale
-        )
-        vision_logits = ops.transpose(text_logits)
-        return vision_logits, text_logits
-
-    def compute_output_shape(
-        self, vision_embedding_shape, text_embedding_shape
-    ):
-        vision_logits_shape = (
-            vision_embedding_shape[0],
-            text_embedding_shape[0],
-        )
-        text_logits_shape = (
-            text_embedding_shape[0],
-            vision_embedding_shape[0],
-        )
-        return vision_logits_shape, text_logits_shape
+from keras_hub.src.models.clip.clip_layers import CLIPHead
+from keras_hub.src.models.clip.clip_layers import CLIPTextPooler
+from keras_hub.src.models.clip.clip_layers import CLIPVisionPooler
 
 
 @keras_hub_export("keras_hub.models.CLIPBackbone")
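The hunk above is a pure relocation: the pooler and head layers formerly defined in `clip_backbone.py` now come from the new `clip_layers.py` module, shown in full below. For code that reached into these internal `src` paths, the change amounts to an import swap; a sketch, assuming the old classes were imported directly from the backbone module:

```python
# Before (0.22.0.dev202507150421), the layers were defined in clip_backbone.py:
# from keras_hub.src.models.clip.clip_backbone import CLIPHead

# After (0.22.0.dev202507170424), they live in clip_layers.py:
from keras_hub.src.models.clip.clip_layers import CLIPHead
from keras_hub.src.models.clip.clip_layers import CLIPTextPooler
from keras_hub.src.models.clip.clip_layers import CLIPVisionPooler
```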
keras_hub/src/models/clip/clip_layers.py
ADDED
@@ -0,0 +1,295 @@
+import math
+
+from keras import layers
+from keras import ops
+
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+def quick_gelu(x):
+    return x * ops.sigmoid(1.702 * x)
+
+
+class CLIPVisionEmbedding(layers.Layer):
+    def __init__(
+        self,
+        hidden_dim,
+        patch_size,
+        image_size,
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        super().__init__(dtype=dtype, **kwargs)
+        self.hidden_dim = int(hidden_dim)
+        self.patch_size = int(patch_size)
+        self.image_size = int(image_size)
+        data_format = standardize_data_format(data_format)
+        self.data_format = data_format
+        num_patches = (image_size // patch_size) ** 2
+        self.num_positions = num_patches + 1
+
+        self.patch_embedding = layers.Conv2D(
+            hidden_dim,
+            kernel_size=patch_size,
+            strides=patch_size,
+            data_format=data_format,
+            use_bias=False,
+            dtype=dtype,
+            name="patch_embedding",
+        )
+        self.position_embedding = layers.Embedding(
+            num_patches + 1, hidden_dim, dtype=dtype, name="position_embedding"
+        )
+
+    def build(self, input_shape):
+        self.class_embedding = self.add_weight(
+            shape=(self.hidden_dim,),
+            initializer="random_normal",
+            dtype=self.variable_dtype,
+            name="class_embedding",
+        )
+        self.position_ids = self.add_weight(
+            shape=(1, self.num_positions),
+            initializer="zeros",
+            # Let the backend determine the int dtype. For example, tf
+            # requires int64 for correct device placement, whereas jax and torch
+            # don't.
+            dtype=int,
+            trainable=False,
+            name="position_ids",
+        )
+        self.patch_embedding.build(input_shape)
+        self.position_embedding.build(self.position_ids.shape)
+
+    def call(self, inputs, training=None):
+        x = inputs
+        batch_size = ops.shape(x)[0]
+        patch_embeddings = self.patch_embedding(x, training=training)
+        if self.data_format == "channels_last":
+            patch_embeddings = ops.reshape(
+                patch_embeddings, (batch_size, -1, self.hidden_dim)
+            )
+        else:
+            patch_embeddings = ops.reshape(
+                patch_embeddings, (batch_size, self.hidden_dim, -1)
+            )
+            patch_embeddings = ops.transpose(patch_embeddings, (0, 2, 1))
+        class_embeddings = ops.expand_dims(self.class_embedding, axis=(0, 1))
+        class_embeddings = ops.tile(class_embeddings, (batch_size, 1, 1))
+        position_embeddings = self.position_embedding(self.position_ids)
+        embeddings = ops.concatenate(
+            [class_embeddings, patch_embeddings], axis=1
+        )
+        return ops.add(embeddings, position_embeddings)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "patch_size": self.patch_size,
+                "image_size": self.image_size,
+            }
+        )
+        return config
+
+    def compute_output_shape(self, input_shape):
+        output_shape = [input_shape[0], None, self.hidden_dim]
+        if self.data_format == "channels_last":
+            if input_shape[1] is not None and input_shape[2] is not None:
+                patch_num = input_shape[1] // self.patch_size
+                output_shape[1] = patch_num**2 + 1
+        else:
+            if input_shape[2] is not None and input_shape[3] is not None:
+                patch_num = input_shape[2] // self.patch_size
+                output_shape[1] = patch_num**2 + 1
+        return output_shape
+
+
+class CLIPEncoderLayer(layers.Layer):
+    def __init__(
+        self,
+        hidden_dim,
+        num_heads,
+        intermediate_dim,
+        intermediate_activation="quick_gelu",
+        use_causal_mask=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if hidden_dim % num_heads != 0:
+            raise ValueError(
+                "`hidden_dim` must be divisible by `num_heads`. "
+                f"Received: hidden_dim={hidden_dim}, num_heads={num_heads}"
+            )
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.intermediate_dim = intermediate_dim
+        self.intermediate_activation = intermediate_activation
+        self.use_causal_mask = use_causal_mask
+
+        if intermediate_activation == "quick_gelu":
+            intermediate_activation = quick_gelu
+
+        self.layer_norm_1 = layers.LayerNormalization(
+            epsilon=1e-5, dtype=self.dtype_policy, name="layer_norm_1"
+        )
+        self.attention = layers.MultiHeadAttention(
+            num_heads,
+            hidden_dim // num_heads,
+            dtype=self.dtype_policy,
+            name="attention",
+        )
+        self.layer_norm_2 = layers.LayerNormalization(
+            epsilon=1e-5, dtype=self.dtype_policy, name="layer_norm_2"
+        )
+        self.dense_1 = layers.Dense(
+            self.intermediate_dim, dtype=self.dtype_policy, name="dense_1"
+        )
+        self.activation = layers.Activation(
+            intermediate_activation, dtype=self.dtype_policy, name="activation"
+        )
+        self.dense_2 = layers.Dense(
+            self.hidden_dim, dtype=self.dtype_policy, name="dense_2"
+        )
+
+    def build(self, input_shape):
+        self.layer_norm_1.build(input_shape)
+        self.attention.build(input_shape, input_shape, input_shape)
+        self.layer_norm_2.build(input_shape)
+        self.dense_1.build(input_shape)
+        input_shape = self.dense_1.compute_output_shape(input_shape)
+        self.dense_2.build(input_shape)
+
+    def compute_output_shape(self, inputs_shape):
+        outputs_shape = list(inputs_shape)
+        outputs_shape[-1] = self.hidden_dim
+        return outputs_shape
+
+    def call(self, x, training=None):
+        residual = x
+        x = self.layer_norm_1(x)
+        x = self.attention(
+            x, x, x, training=training, use_causal_mask=self.use_causal_mask
+        )
+        x = ops.add(residual, x)
+
+        residual = x
+        x = self.dense_1(self.layer_norm_2(residual))
+        x = self.activation(x)
+        x = self.dense_2(x)
+        x = ops.add(residual, x)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "num_heads": self.num_heads,
+                "intermediate_dim": self.intermediate_dim,
+                "intermediate_activation": self.intermediate_activation,
+                "use_causal_mask": self.use_causal_mask,
+            }
+        )
+        return config
+
+
+class CLIPVisionPooler(layers.Layer):
+    """The vision pooler layer of CLIP.
+
+    `CLIPVisionPooler` will extracts the first token (index `0`) from the
+    sequence of the vision embeddings as the pooled outputs.
+
+    Call arguments:
+        vision_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+    """
+
+    def call(self, vision_embeddings):
+        return vision_embeddings[:, 0, :]
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+
+class CLIPTextPooler(layers.Layer):
+    """The text pooler layer of CLIP.
+
+    `CLIPTextPooler` extracts the text embeddings at the positions of EOS tokens
+    as the pooled outputs.
+
+    Call arguments:
+        text_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+        token_ids: A tensor of shape `(batch_size, max_tokens)`, used to
+            identify the positions of EOS tokens.
+    """
+
+    def call(self, text_embeddings, token_ids):
+        # `keepdims` is not supported in `keras<=3.1`.
+        eos_index = ops.argmax(token_ids, axis=-1)
+        eos_index = ops.expand_dims(eos_index, axis=-1)
+        eos_index = ops.expand_dims(eos_index, axis=-1)
+        pooled_outputs = ops.take_along_axis(text_embeddings, eos_index, axis=1)
+        return ops.squeeze(pooled_outputs, axis=1)
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+
+class CLIPHead(layers.Layer):
+    """The head layer of CLIP.
+
+    `CLIPHead` takes `vision_embedding` and `text_embedding` as inputs to
+    compute the corresponding logits. Both embeddings are L2 normalized and used
+    to compute pairwise cosine similarity. The resulting logits are then scaled
+    by a learnable `logit_scale` parameter.
+
+    Call arguments:
+        vision_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+        text_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+    """
+
+    def build(self, input_shape):
+        self.logit_scale = self.add_weight(
+            shape=(),
+            initializer=lambda *a, **kw: math.log(1 / 0.07),
+            trainable=True,
+            dtype=self.variable_dtype,
+            name="logit_scale",
+        )
+
+    def call(self, vision_embedding, text_embedding):
+        normalized_vision_embedding = ops.sqrt(
+            ops.sum(ops.power(vision_embedding, 2), axis=-1, keepdims=True)
+        )
+        normalized_text_embedding = ops.sqrt(
+            ops.sum(ops.power(text_embedding, 2), axis=-1, keepdims=True)
+        )
+        vision_embedding = vision_embedding / normalized_vision_embedding
+        text_embedding = text_embedding / normalized_text_embedding
+        logit_scale = ops.exp(self.logit_scale)
+        text_logits = (
+            ops.matmul(
+                text_embedding,
+                ops.transpose(vision_embedding),
+            )
+            * logit_scale
+        )
+        vision_logits = ops.transpose(text_logits)
+        return vision_logits, text_logits
+
+    def compute_output_shape(
+        self, vision_embedding_shape, text_embedding_shape
+    ):
+        vision_logits_shape = (
+            vision_embedding_shape[0],
+            text_embedding_shape[0],
+        )
+        text_logits_shape = (
+            text_embedding_shape[0],
+            vision_embedding_shape[0],
+        )
+        return vision_logits_shape, text_logits_shape
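As a quick sanity check of the consolidated module, a minimal sketch wiring the vision-side layers together with toy dimensions (the layer signatures are taken from the file above; the internal `keras_hub.src...` import path, the toy sizes, and the `use_causal_mask=False` setting are assumptions for illustration only):

```python
import numpy as np

from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
from keras_hub.src.models.clip.clip_layers import CLIPVisionEmbedding
from keras_hub.src.models.clip.clip_layers import CLIPVisionPooler

# Toy configuration; real CLIP presets use much larger dimensions.
embedding = CLIPVisionEmbedding(hidden_dim=64, patch_size=16, image_size=64)
encoder_layer = CLIPEncoderLayer(
    hidden_dim=64,
    num_heads=4,
    intermediate_dim=128,
    use_causal_mask=False,  # Assumed bidirectional attention for the vision tower.
)
pooler = CLIPVisionPooler()

images = np.ones((2, 64, 64, 3), dtype="float32")  # channels_last inputs
x = embedding(images)  # (2, (64 // 16) ** 2 + 1, 64): class token + patch tokens
x = encoder_layer(x)  # (2, 17, 64)
pooled = pooler(x)  # (2, 64): the class-token embedding
```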
keras_hub/src/models/clip/clip_preprocessor.py
CHANGED
@@ -2,8 +2,10 @@ import keras
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+from keras_hub.src.models.clip.clip_backbone import CLIPBackbone
+from keras_hub.src.models.clip.clip_image_converter import CLIPImageConverter
 from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer
-from keras_hub.src.models.preprocessor import Preprocessor
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 try:
@@ -13,32 +15,18 @@ except ImportError:
 
 
 @keras_hub_export("keras_hub.models.CLIPPreprocessor")
-class CLIPPreprocessor(Preprocessor):
-    """CLIP
+class CLIPPreprocessor(CausalLMPreprocessor):
+    """CLIP preprocessor.
 
     This preprocessing layer will do 2 things:
 
-
-
-
-    This layer can be used directly with `tf.data.Dataset.map` to preprocess
-    string data in the `(x, y, sample_weight)` format used by
-    `keras.Model.fit`.
-
-    The call method of this layer accepts three arguments, `x`, `y`, and
-    `sample_weight`. `x` can be a python string or tensor representing a single
-    segment, a list of python strings representing a batch of single segments,
-    or a list of tensors representing multiple segments to be packed together.
-    `y` and `sample_weight` are both optional, can have any format, and will be
-    passed through unaltered.
-
-    `CLIPPreprocessor` forces the input to have only one segment, as CLIP is
-    mainly used for generation tasks. For tasks having multi-segment inputs
-    like "glue/mnli", please use a model designed for classification purposes
-    such as BERT or RoBERTa.
+    This preprocessing layer is meant for use with
+    `keras_hub.models.CLIPBackbone`. By default, it will take in batches of
+    strings and images, and return token ids and resized images.
 
     Args:
         tokenizer: A `keras_hub.models.CLIPTokenizer` instance.
+        image_converter: A `keras_hub.models.CLIPImageConverter` instance.
        sequence_length: The length of the packed inputs.
         add_start_token: If `True`, the preprocessor will prepend the tokenizer
             start token to each input sequence.
@@ -47,32 +35,62 @@ class CLIPPreprocessor(Preprocessor):
         to_lower: bool. Whether to lower the inputs.
 
     Call arguments:
-        x: A
-
-
+        x: A dict with `"prompts"` and `"images"` keys, where `"prompts"` is
+            `tf.Tensor` or list of python strings and `"images"` are the image
+            tensors.
+        y: Label data. Should always be `None` since SigLIP doesn't need the
+            label to calculate the loss.
+        sample_weight: Label weights.
         sequence_length: Pass to override the configured `sequence_length` of
             the layer.
-    """
 
-
+    Examples:
+    ```python
+    # Load the preprocessor from a preset.
+    preprocessor = keras_hub.models.CLIPPreprocessor.from_preset(
+        "clip_vit_base_patch16"
+    )
+
+    # Tokenize the sentence and preprocess the image.
+    preprocessor(
+        {
+            "prompts": "The quick brown fox jumped.",
+            "images": np.ones(shape=(123, 123, 3)),
+        }
+    )
+
+    # Tokenize a batch of sentences and preprocess a batch of images.
+    preprocessor(
+        {
+            "prompts": ["The quick brown fox jumped.", "The fox slept."],
+            "images": np.ones(shape=(2, 123, 123, 3)),
+        }
+    )
+    ```
+    """
 
+    backbone_cls = CLIPBackbone
     tokenizer_cls = CLIPTokenizer
+    image_converter_cls = CLIPImageConverter
 
     def __init__(
         self,
         tokenizer,
+        image_converter=None,
         sequence_length=77,
         add_start_token=True,
         add_end_token=True,
         to_lower=True,
         **kwargs,
     ):
-        super().__init__(
-
-
-
-
-
+        super().__init__(
+            tokenizer=tokenizer,
+            sequence_length=sequence_length,
+            add_start_token=add_start_token,
+            add_end_token=add_end_token,
+            **kwargs,
+        )
+        self.image_converter = image_converter
         self.to_lower = to_lower
 
     def build(self, input_shape):
@@ -96,10 +114,14 @@ class CLIPPreprocessor(Preprocessor):
         sequence_length=None,
     ):
         sequence_length = sequence_length or self.sequence_length
+        images, prompts = x["images"], x["prompts"]
         if self.to_lower:
-
+            prompts = tf.strings.lower(prompts)
+        prompts = self.tokenizer(prompts)
+        if images is not None and self.image_converter:
+            images = self.image_converter(images)
         token_ids, padding_mask = self.packer(
-
+            prompts,
             sequence_length=sequence_length,
             add_start_value=self.add_start_token,
             add_end_value=self.add_end_token,
@@ -107,6 +129,7 @@ class CLIPPreprocessor(Preprocessor):
         x = {
             "token_ids": token_ids,
             "padding_mask": padding_mask,
+            "images": images,
         }
         return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
 
@@ -114,21 +137,7 @@ class CLIPPreprocessor(Preprocessor):
         config = super().get_config()
         config.update(
             {
-                "sequence_length": self.sequence_length,
-                "add_start_token": self.add_start_token,
-                "add_end_token": self.add_end_token,
                 "to_lower": self.to_lower,
             }
         )
         return config
-
-    @property
-    def sequence_length(self):
-        """The padded length of model input sequences."""
-        return self._sequence_length
-
-    @sequence_length.setter
-    def sequence_length(self, value):
-        self._sequence_length = value
-        if self.packer is not None:
-            self.packer.sequence_length = value
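Since `CLIPPreprocessor` now subclasses `CausalLMPreprocessor` and consumes `{"prompts", "images"}` dictionaries, it can still be mapped over a `tf.data` pipeline. A sketch, assuming the `"clip_vit_base_patch16"` preset referenced in the docstring above is available and using illustrative shapes:

```python
import numpy as np
import tensorflow as tf

import keras_hub

preprocessor = keras_hub.models.CLIPPreprocessor.from_preset(
    "clip_vit_base_patch16"
)

features = {
    "prompts": ["a photo of a cat", "a photo of a dog"],
    "images": np.ones((2, 224, 224, 3), dtype="float32"),
}
ds = tf.data.Dataset.from_tensor_slices(features).batch(2)
# Each mapped batch carries "token_ids", "padding_mask" and "images",
# matching the keys added in the hunks above.
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
```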
keras_hub/src/models/clip/clip_text_encoder.py
CHANGED
@@ -5,7 +5,7 @@ from keras_hub.src.layers.modeling.token_and_position_embedding import (
     TokenAndPositionEmbedding,
 )
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.clip.clip_encoder_block import CLIPEncoderBlock
+from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
 
 
 @keras_hub_export("keras_hub.models.CLIPTextEncoder")
@@ -71,7 +71,7 @@ class CLIPTextEncoder(Backbone):
             name=f"{prefix}embedding",
         )
         self.encoder_layers = [
-            CLIPEncoderBlock(
+            CLIPEncoderLayer(
                 hidden_dim,
                 num_heads,
                 intermediate_dim,
keras_hub/src/models/clip/clip_vision_encoder.py
CHANGED
@@ -2,8 +2,8 @@ from keras import layers
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.clip.clip_encoder_block import CLIPEncoderBlock
-from keras_hub.src.models.clip.clip_vision_embedding import CLIPVisionEmbedding
+from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
+from keras_hub.src.models.clip.clip_layers import CLIPVisionEmbedding
 from keras_hub.src.utils.keras_utils import standardize_data_format
 
 
@@ -91,7 +91,7 @@ class CLIPVisionEncoder(Backbone):
             epsilon=1e-5, dtype=dtype, name=f"{prefix}pre_layer_norm"
         )
         self.encoder_layers = [
-            CLIPEncoderBlock(
+            CLIPEncoderLayer(
                 hidden_dim,
                 num_heads,
                 intermediate_dim,