keras-hub-nightly 0.20.0.dev202504030357__py3-none-any.whl → 0.21.0.dev202504050402__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. keras_hub/api/models/__init__.py +5 -20
  2. keras_hub/api/tokenizers/__init__.py +0 -4
  3. keras_hub/src/layers/preprocessing/image_converter.py +26 -16
  4. keras_hub/src/models/gemma3/gemma3_attention.py +74 -21
  5. keras_hub/src/models/gemma3/gemma3_backbone.py +117 -46
  6. keras_hub/src/models/gemma3/gemma3_causal_lm.py +72 -15
  7. keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +512 -355
  8. keras_hub/src/models/gemma3/gemma3_decoder_block.py +23 -19
  9. keras_hub/src/models/gemma3/gemma3_image_converter.py +6 -0
  10. keras_hub/src/models/gemma3/gemma3_interleave_embeddings.py +56 -16
  11. keras_hub/src/models/gemma3/gemma3_presets.py +74 -8
  12. keras_hub/src/models/gemma3/gemma3_tokenizer.py +9 -0
  13. keras_hub/src/models/gemma3/{gemma3_vit.py → gemma3_vision_encoder.py} +150 -139
  14. keras_hub/src/models/qwen/qwen_backbone.py +0 -7
  15. keras_hub/src/models/qwen/qwen_causal_lm.py +0 -7
  16. keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py +0 -7
  17. keras_hub/src/models/qwen/qwen_tokenizer.py +0 -9
  18. keras_hub/src/models/roformer_v2/roformer_v2_backbone.py +1 -1
  19. keras_hub/src/models/roformer_v2/roformer_v2_text_classifier.py +2 -2
  20. keras_hub/src/models/vit/vit_image_converter.py +8 -3
  21. keras_hub/src/tests/test_case.py +4 -0
  22. keras_hub/src/utils/tensor_utils.py +6 -0
  23. keras_hub/src/version_utils.py +1 -1
  24. {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/METADATA +1 -1
  25. {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/RECORD +27 -27
  26. {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/WHEEL +0 -0
  27. {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/top_level.txt +0 -0

keras_hub/src/models/gemma3/gemma3_decoder_block.py
@@ -14,16 +14,17 @@ from keras_hub.src.models.gemma3.rms_normalization import RMSNormalization
 class Gemma3DecoderBlock(keras.layers.Layer):
     """Transformer decoder layer for Gemma3.
 
-    This is different from Gemma and Gemma2 in several ways:
-
-    - `use_query_key_norm`: Applies RMS Norm on query, key.
-    - `rope_wavelength`: RoPE wavelength differs from local to global attention
-      layers.
-    - `rope_scaling_factor`: RoPE scaling factor differs from local to global
-      attention layers.
-    - `gate_dim_reduction`: In the gating layers, Gemma and Gemma2 reduce
-      intermediate dimension by 2. For Gemma3, no such reduction happens.
-    - Uses bidirectional attention for images, and causal for everything else.
+    This decoder layer is the same as the layer used for Gemma and Gemma2.
+    However, there are a few key differences. Firstly, image tokens have
+    bidirectional masking. Additionally, this layer exposes the following args:
+
+    `use_query_key_norm`: bool. If True, apply RMS normalization on query
+        and key. For Gemma3, this is True.
+    `rope_wavelength`: float. Configurable value for RoPE wavelength. Gemma3
+        uses 10K for local attention layers and 1M for global attention layers.
+    `gate_dim_reduction`: int. In the gating layers, the output dimension is
+        `intermediate_dim // gate_dim_reduction`. For Gemma and Gemma2, this
+        value is 2. For Gemma3, it is 1.
     """
 
     def __init__(
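To make the `gate_dim_reduction` arithmetic concrete, here is a minimal sketch of a Gemma-style gated feed-forward block (an illustration only, not the KerasHub layer; the GELU gating and layer names are assumptions): the width of the two gating projections is `intermediate_dim // gate_dim_reduction`, so Gemma/Gemma2 (`2`) halve it while Gemma3 (`1`) keeps the full intermediate width.

```python
import keras
from keras import ops


def gated_ffw(x, hidden_dim, intermediate_dim, gate_dim_reduction):
    # Gemma/Gemma2: gate_dim_reduction=2 -> projections of width
    # intermediate_dim // 2. Gemma3: gate_dim_reduction=1 -> full width.
    gate_dim = intermediate_dim // gate_dim_reduction
    gate = keras.layers.Dense(gate_dim, use_bias=False)(x)
    up = keras.layers.Dense(gate_dim, use_bias=False)(x)
    hidden = ops.gelu(gate, approximate=True) * up
    return keras.layers.Dense(hidden_dim, use_bias=False)(hidden)


x = keras.random.normal((1, 8, 16))
print(gated_ffw(x, hidden_dim=16, intermediate_dim=64, gate_dim_reduction=1).shape)
```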
@@ -160,9 +161,10 @@ class Gemma3DecoderBlock(keras.layers.Layer):
         # Isometric
         return input_shape
 
-    def _compute_image_bidirectional_attention_mask(self, text_mask):
-        # text_mask is True for text, False for images. Shape of (bsz, seq_len).
-        bidirectional_mask = ops.logical_not(text_mask)
+    def _compute_image_bidirectional_attention_mask(self, vision_mask):
+        # vision_mask is False for text, True for images. Shape of
+        # (bsz, seq_len).
+        bidirectional_mask = vision_mask
 
         # Left pad with 0.
         padded_mask = ops.cast(
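Callers that previously built a `text_mask` (True for text tokens) can derive the new `vision_mask` (True for image tokens) by simple negation; a one-line sketch:

```python
from keras import ops

# Old convention: True for text. New convention: True for image tokens.
text_mask = ops.convert_to_tensor([[True, False, False, True]])
vision_mask = ops.logical_not(text_mask)
```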
@@ -194,7 +196,7 @@ class Gemma3DecoderBlock(keras.layers.Layer):
         self,
         x,
         padding_mask,
-        text_mask,
+        vision_mask,
         cache,
         cache_update_index,
     ):
@@ -216,9 +218,9 @@ class Gemma3DecoderBlock(keras.layers.Layer):
 
         # Compute bidirectional mask (image tokens can attend to each other
         # in both directions, within the same image).
-        if text_mask is not None:
+        if vision_mask is not None:
             bidirectional_image_mask = (
-                self._compute_image_bidirectional_attention_mask(text_mask)
+                self._compute_image_bidirectional_attention_mask(vision_mask)
             )
             causal_mask = ops.logical_or(causal_mask, bidirectional_image_mask)
 
@@ -232,14 +234,15 @@ class Gemma3DecoderBlock(keras.layers.Layer):
         self,
         x,
         padding_mask=None,
-        text_mask=None,
+        vision_mask=None,
         cache=None,
         cache_update_index=0,
+        cache_update_mask=None,
     ):
-        # Note: `text_mask` is used only for Gemma33.
+        # Note: `vision_mask` is used only for Gemma3.
         normalized_x = self.pre_attention_norm(x)
         attention_mask = self._compute_attention_mask(
-            normalized_x, padding_mask, text_mask, cache, cache_update_index
+            normalized_x, padding_mask, vision_mask, cache, cache_update_index
         )
         if cache is not None:
             attention, new_cache = self.attention(
@@ -247,6 +250,7 @@ class Gemma3DecoderBlock(keras.layers.Layer):
                 attention_mask=attention_mask,
                 cache=cache,
                 cache_update_index=cache_update_index,
+                cache_update_mask=cache_update_mask,
             )
         else:
             attention = self.attention(
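To illustrate the bidirectional image masking these hunks implement, here is a simplified sketch (it assumes a single image per sequence and ignores padding and caching, which the real `_compute_image_bidirectional_attention_mask` handles): image positions attend to each other in both directions, everything else stays causal, and the two masks are merged with `ops.logical_or` exactly as in the diff.

```python
import numpy as np
from keras import ops

seq_len = 6
# True at image-token positions, False for text: (batch, seq_len).
vision_mask = np.array([[False, True, True, True, False, False]])

# Standard lower-triangular causal mask: (batch, seq_len, seq_len).
causal_mask = np.tril(np.ones((1, seq_len, seq_len), dtype=bool))

# Image tokens can see each other in both directions.
bidirectional_image_mask = vision_mask[:, :, None] & vision_mask[:, None, :]

attention_mask = ops.logical_or(causal_mask, bidirectional_image_mask)
print(ops.convert_to_numpy(attention_mask)[0].astype(int))
```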

keras_hub/src/models/gemma3/gemma3_image_converter.py
@@ -6,3 +6,9 @@ from keras_hub.src.models.gemma3.gemma3_backbone import Gemma3Backbone
 @keras_hub_export("keras_hub.layers.Gemma3ImageConverter")
 class Gemma3ImageConverter(ImageConverter):
     backbone_cls = Gemma3Backbone
+
+    def __init__(self, **kwargs):
+        # Always do image preprocessing in float32
+        kwargs.pop("dtype", None)
+        dtype = "float32"
+        super().__init__(dtype=dtype, **kwargs)
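A small usage sketch of the new behavior (the `image_size` value is only illustrative): any `dtype` passed by the caller is discarded and preprocessing runs in float32.

```python
from keras_hub.src.models.gemma3.gemma3_image_converter import Gemma3ImageConverter

# The requested bfloat16 dtype is dropped; the converter stays in float32.
converter = Gemma3ImageConverter(image_size=(896, 896), dtype="bfloat16")
print(converter.dtype)  # float32
```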

keras_hub/src/models/gemma3/gemma3_interleave_embeddings.py
@@ -5,12 +5,17 @@ from keras import ops
 class Gemma3InterleaveEmbeddings(keras.layers.Layer):
     """Places image embeddings in the correct position in an embedding sequence.
 
+    For Gemma3, images can be in any position in the input sequence. In order
+    to do accomplish this, we have image placeholder tokens in the input
+    sequence. We fill up these positions with the image embeddings as returned
+    by the vision encoder.
+
     Args:
         num_vision_tokens_per_image: int. Number of soft tokens per image.
     """
 
-    def __init__(self, num_vision_tokens_per_image, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, num_vision_tokens_per_image, dtype=None, **kwargs):
+        super().__init__(dtype=dtype, **kwargs)
 
         self.num_vision_tokens_per_image = num_vision_tokens_per_image
 
@@ -19,12 +24,17 @@ class Gemma3InterleaveEmbeddings(keras.layers.Layer):
         Integrates image embeddings into a text embedding sequence.
 
         Args:
-            image_embeddings: Tensor of shape
-                `(batch_size * num_images_per_prompt,
-                num_vision_tokens_per_image, embedding_dim)`.
-            text_embeddings: Tensor of shape
-                `(batch_size, seq_length, embedding_dim)`.
-            text_mask: Boolean tensor of shape `(batch_size, seq_length)`.
+            image_embeddings: tensor. Image embeddings as returned by the
+                vision encoder (`Gemma3VisionEncoder`, usually). Shape:
+                `(batch_size * num_images_per_prompt, `
+                `num_vision_tokens_per_image, embedding_dim)`.
+            text_embeddings: tensor. Embeddings returned by the text embedding
+                layer. Shape: `(batch_size, seq_length, embedding_dim)`.
+            vision_indices: tensor. Indexes into `text_embeddings`, used to
+                identify which places are supposed to be replaced by
+                `image_embeddings`. Shape:
+                `(batch_size,`
+                `num_images_per_prompt * num_vision_tokens_per_image)`.
 
         Returns:
             Tensor of shape `(batch_size, seq_length, embedding_dim)`
@@ -32,32 +42,62 @@ class Gemma3InterleaveEmbeddings(keras.layers.Layer):
         """
 
         batch_size, seq_length, embedding_dim = ops.shape(text_embeddings)
+        # `num_images` will be 0 for text only inputs, and
+        # `batch_size * max_images_per_prompt` if images are passed.
+        num_images = ops.shape(image_embeddings)[0]
 
-        # Flatten text embeddings, text mask and image embeddings.
+        # Flatten text embeddings, image embeddings and indices.
         flat_text_embeddings = ops.reshape(
             text_embeddings, (batch_size * seq_length, embedding_dim)
         )
-
-        # The image batch size might be different when we pass only text, i.e,
-        # it will be 0 for text-only.
-        image_batch_size = ops.shape(image_embeddings)[0]
+        # `flat_image_embeddings` is the `updates` tensor and should be of shape
+        # `(num_updates, embedding_dim)`.
         flat_image_embeddings = ops.reshape(
             image_embeddings,
             (
-                image_batch_size * self.num_vision_tokens_per_image,
+                num_images * self.num_vision_tokens_per_image,
                 embedding_dim,
             ),
         )
 
-        # Reconstruct embeddings.
+        # For vision indices, we need to add values such that the indices
+        # index into a flattened `text_embeddings`.
+        to_add = ops.multiply(
+            keras.ops.arange(batch_size, dtype="int32"), seq_length
+        )
+        to_add = ops.expand_dims(to_add, axis=-1)
+        vision_indices = ops.add(vision_indices, to_add)
+
+        # indices should be of shape `(num_updates, 1)`. `num_updates` is
+        # how many vision tokens there are to update.
         vision_indices_shape = ops.shape(vision_indices)
         flat_vision_indices = ops.reshape(
             vision_indices,
             (vision_indices_shape[0] * vision_indices_shape[1], 1),
         )
         indices = ops.cast(flat_vision_indices, "int32")
+
+        # Before reconstructing, store the 0th index so that we can restore it
+        # later.
+        zeroth_index_text_embeddings = ops.take(
+            flat_text_embeddings,
+            indices=ops.squeeze(to_add, axis=-1),
+            axis=0,
+        )
+
+        # Reconstruct embeddings
+        reconstructed_embedding = ops.scatter_update(
+            inputs=flat_text_embeddings,
+            indices=indices,
+            updates=flat_image_embeddings,
+        )
+
+        # Remember that we pad `vision_indices` with the 0th index. We need to
+        # restore the original value in the reconstructed embedding tensor.
         reconstructed_embedding = ops.scatter_update(
-            flat_text_embeddings, indices, flat_image_embeddings
+            inputs=reconstructed_embedding,
+            indices=to_add,
+            updates=zeroth_index_text_embeddings,
         )
 
         # Reshape to original dimensions
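To see what the scatter-based interleaving above does end to end, here is a toy NumPy re-enactment (shapes and values are invented; the real layer performs the same steps with `keras.ops.scatter_update`): offset each sample's vision indices into the flattened `(batch * seq)` axis, scatter the image embeddings over the placeholder rows, then restore each sample's row 0, which is where padded indices point.

```python
import numpy as np

batch_size, seq_length, embedding_dim = 2, 6, 4
num_vision_tokens_per_image = 2

text_embeddings = np.arange(
    batch_size * seq_length * embedding_dim, dtype="float32"
).reshape(batch_size, seq_length, embedding_dim)
# Two "images", each contributing two vision tokens, all marked with -1.
image_embeddings = -np.ones(
    (2, num_vision_tokens_per_image, embedding_dim), dtype="float32"
)

# Sample 0 has image placeholders at positions 2-3; sample 1 is text-only,
# so its indices are padded with 0.
vision_indices = np.array([[2, 3], [0, 0]])

flat_text = text_embeddings.reshape(batch_size * seq_length, embedding_dim)
flat_image = image_embeddings.reshape(-1, embedding_dim)

# Offset per-sample indices so they index the flattened (batch * seq) axis.
to_add = np.arange(batch_size)[:, None] * seq_length
flat_indices = (vision_indices + to_add).reshape(-1)

row_zero = flat_text[to_add[:, 0]].copy()  # save each sample's 0th row
flat_text[flat_indices] = flat_image       # scatter the image embeddings
flat_text[to_add[:, 0]] = row_zero         # undo writes from padded indices

result = flat_text.reshape(batch_size, seq_length, embedding_dim)
print(result[0, 2:4])  # image embeddings landed on the placeholder positions
print(result[1, 0])    # sample 1's row 0 keeps its original values
```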

keras_hub/src/models/gemma3/gemma3_presets.py
@@ -11,7 +11,7 @@ backbone_presets = {
             "params": 999885952,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_1b/1",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_1b/3",
     },
     "gemma3_instruct_1b": {
         "metadata": {
@@ -22,7 +22,7 @@ backbone_presets = {
             "params": 999885952,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_1b/1",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_1b/3",
     },
     "gemma3_4b_text": {
         "metadata": {
@@ -33,7 +33,7 @@ backbone_presets = {
             "params": 3880099328,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_4b_text/1",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_4b_text/2",
     },
     "gemma3_instruct_4b_text": {
         "metadata": {
@@ -44,7 +44,7 @@ backbone_presets = {
             "params": 3880099328,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_4b_text/2",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_4b_text/3",
     },
     "gemma3_12b_text": {
         "metadata": {
@@ -55,7 +55,7 @@ backbone_presets = {
             "params": 11765788416,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_12b_text/1",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_12b_text/2",
     },
     "gemma3_instruct_12b_text": {
         "metadata": {
@@ -66,7 +66,7 @@ backbone_presets = {
             "params": 11765788416,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_12b_text/1",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_12b_text/2",
     },
     "gemma3_27b_text": {
         "metadata": {
@@ -77,7 +77,7 @@ backbone_presets = {
             "params": 27009002240,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_27b_text/1",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_27b_text/3",
     },
     "gemma3_instruct_27b_text": {
         "metadata": {
@@ -88,6 +88,72 @@ backbone_presets = {
             "params": 27009002240,
             "path": "gemma3",
         },
-        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_27b_text/1",
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_27b_text/2",
+    },
+    "gemma3_4b": {
+        "metadata": {
+            "description": (
+                "4 billion parameter, 34-layer, vision+text pretrained "
+                "Gemma3 model."
+            ),
+            "params": 4299915632,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_4b/1",
+    },
+    "gemma3_instruct_4b": {
+        "metadata": {
+            "description": (
+                "4 billion parameter, 34-layer, vision+text instruction-tuned "
+                "Gemma3 model."
+            ),
+            "params": 4299915632,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_4b/1",
+    },
+    "gemma3_12b": {
+        "metadata": {
+            "description": (
+                "12 billion parameter, 48-layer, vision+text pretrained "
+                "Gemma3 model."
+            ),
+            "params": 12187079280,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_12b/1",
+    },
+    "gemma3_instruct_12b": {
+        "metadata": {
+            "description": (
+                "12 billion parameter, 48-layer, vision+text instruction-tuned "
+                "Gemma3 model."
+            ),
+            "params": 12187079280,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_12b/1",
+    },
+    "gemma3_27b": {
+        "metadata": {
+            "description": (
+                "27 billion parameter, 62-layer, vision+text pretrained "
+                "Gemma3 model."
+            ),
+            "params": 27432062576,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_27b/1",
+    },
+    "gemma3_instruct_27b": {
+        "metadata": {
+            "description": (
+                "27 billion parameter, 62-layer, vision+text instruction-tuned "
+                "Gemma3 model."
+            ),
+            "params": 27432062576,
+            "path": "gemma3",
+        },
+        "kaggle_handle": "kaggle://keras/gemma3/keras/gemma3_instruct_27b/1",
     },
 }
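The new vision+text checkpoints load through the standard KerasHub preset API. A minimal usage sketch (assumes network access and Kaggle credentials; the checkpoints are several gigabytes):

```python
import keras_hub

# Text-only generation with one of the newly added multimodal presets.
gemma3_lm = keras_hub.models.Gemma3CausalLM.from_preset("gemma3_instruct_4b")
print(gemma3_lm.generate("What is Keras?", max_length=64))
```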

keras_hub/src/models/gemma3/gemma3_tokenizer.py
@@ -4,6 +4,10 @@ from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
     SentencePieceTokenizer,
 )
 
+START_OF_IMAGE_TOKEN = "<start_of_image>"
+IMAGE_PLACEHOLDER_TOKEN = "<img>"
+END_OF_IMAGE_TOKEN = "<end_of_image>"
+
 
 @keras_hub_export(
     [
@@ -84,4 +88,9 @@ class Gemma3Tokenizer(SentencePieceTokenizer):
         # Image placeholder token.
         self._add_special_token("<img>", "image_placeholder")
 
+        # Some tokens which are used in the preprocessor. We need to keep them
+        # here so that the preprocessor works with `tf.data`.
+        self._add_special_token("<start_of_image>", "start_of_image_token")
+        self._add_special_token("<end_of_image>", "end_of_image_token")
+
         super().__init__(proto=proto, **kwargs)
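A usage sketch for the new special tokens (the `keras_hub.tokenizers` export path is assumed here; the preset name is reused from the presets above): because `<start_of_image>` and `<end_of_image>` are registered as special tokens, prompts containing them survive `tf.data` preprocessing as single token ids instead of being split apart by SentencePiece.

```python
import keras_hub
from keras_hub.src.models.gemma3.gemma3_tokenizer import (
    END_OF_IMAGE_TOKEN,
    START_OF_IMAGE_TOKEN,
)

tokenizer = keras_hub.tokenizers.Gemma3Tokenizer.from_preset("gemma3_instruct_4b")

prompt = f"Describe this image: {START_OF_IMAGE_TOKEN}{END_OF_IMAGE_TOKEN}"
token_ids = tokenizer(prompt)
print(tokenizer.detokenize(token_ids))
```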