keras-hub-nightly 0.20.0.dev202504030357__py3-none-any.whl → 0.21.0.dev202504050402__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- keras_hub/api/models/__init__.py +5 -20
- keras_hub/api/tokenizers/__init__.py +0 -4
- keras_hub/src/layers/preprocessing/image_converter.py +26 -16
- keras_hub/src/models/gemma3/gemma3_attention.py +74 -21
- keras_hub/src/models/gemma3/gemma3_backbone.py +117 -46
- keras_hub/src/models/gemma3/gemma3_causal_lm.py +72 -15
- keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +512 -355
- keras_hub/src/models/gemma3/gemma3_decoder_block.py +23 -19
- keras_hub/src/models/gemma3/gemma3_image_converter.py +6 -0
- keras_hub/src/models/gemma3/gemma3_interleave_embeddings.py +56 -16
- keras_hub/src/models/gemma3/gemma3_presets.py +74 -8
- keras_hub/src/models/gemma3/gemma3_tokenizer.py +9 -0
- keras_hub/src/models/gemma3/{gemma3_vit.py → gemma3_vision_encoder.py} +150 -139
- keras_hub/src/models/qwen/qwen_backbone.py +0 -7
- keras_hub/src/models/qwen/qwen_causal_lm.py +0 -7
- keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py +0 -7
- keras_hub/src/models/qwen/qwen_tokenizer.py +0 -9
- keras_hub/src/models/roformer_v2/roformer_v2_backbone.py +1 -1
- keras_hub/src/models/roformer_v2/roformer_v2_text_classifier.py +2 -2
- keras_hub/src/models/vit/vit_image_converter.py +8 -3
- keras_hub/src/tests/test_case.py +4 -0
- keras_hub/src/utils/tensor_utils.py +6 -0
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/RECORD +27 -27
- {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.20.0.dev202504030357.dist-info → keras_hub_nightly-0.21.0.dev202504050402.dist-info}/top_level.txt +0 -0
    
keras_hub/api/models/__init__.py
CHANGED

@@ -183,6 +183,9 @@ from keras_hub.src.models.gemma3.gemma3_causal_lm_preprocessor import (
     Gemma3CausalLMPreprocessor,
 )
 from keras_hub.src.models.gemma3.gemma3_tokenizer import Gemma3Tokenizer
+from keras_hub.src.models.gemma3.gemma3_vision_encoder import (
+    Gemma3VisionEncoder,
+)
 from keras_hub.src.models.gpt2.gpt2_backbone import GPT2Backbone
 from keras_hub.src.models.gpt2.gpt2_causal_lm import GPT2CausalLM
 from keras_hub.src.models.gpt2.gpt2_causal_lm_preprocessor import (
@@ -273,24 +276,6 @@ from keras_hub.src.models.phi3.phi3_causal_lm_preprocessor import (
 )
 from keras_hub.src.models.phi3.phi3_tokenizer import Phi3Tokenizer
 from keras_hub.src.models.preprocessor import Preprocessor
-from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone
-from keras_hub.src.models.qwen.qwen_backbone import (
-    QwenBackbone as Qwen2Backbone,
-)
-from keras_hub.src.models.qwen.qwen_causal_lm import QwenCausalLM
-from keras_hub.src.models.qwen.qwen_causal_lm import (
-    QwenCausalLM as Qwen2CausalLM,
-)
-from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import (
-    QwenCausalLMPreprocessor,
-)
-from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import (
-    QwenCausalLMPreprocessor as Qwen2CausalLMPreprocessor,
-)
-from keras_hub.src.models.qwen.qwen_tokenizer import QwenTokenizer
-from keras_hub.src.models.qwen.qwen_tokenizer import (
-    QwenTokenizer as Qwen2Tokenizer,
-)
 from keras_hub.src.models.resnet.resnet_backbone import ResNetBackbone
 from keras_hub.src.models.resnet.resnet_image_classifier import (
     ResNetImageClassifier,
@@ -324,7 +309,7 @@ from keras_hub.src.models.roberta.roberta_text_classifier_preprocessor import (
 )
 from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer
 from keras_hub.src.models.roformer_v2.roformer_v2_backbone import (
-    RoformerV2Backbone
+    RoformerV2Backbone,
 )
 from keras_hub.src.models.roformer_v2.roformer_v2_masked_lm import (
     RoformerV2MaskedLM,
@@ -333,7 +318,7 @@ from keras_hub.src.models.roformer_v2.roformer_v2_masked_lm_preprocessor import
     RoformerV2MaskedLMPreprocessor,
 )
 from keras_hub.src.models.roformer_v2.roformer_v2_text_classifier import (
-    RoformerV2TextClassifier
+    RoformerV2TextClassifier,
 )
 from keras_hub.src.models.roformer_v2.roformer_v2_text_classifier_preprocessor import (
     RoformerV2TextClassifierPreprocessor,
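With this change, `Gemma3VisionEncoder` is re-exported from the public models API alongside the other Gemma3 symbols, while the Qwen/Qwen2 aliases are dropped from this module. A minimal smoke-test sketch, assuming the 0.21 nightly above is installed (the attribute path simply mirrors the import added in this file):

    # Hypothetical check; assumes keras-hub-nightly 0.21.0.dev202504050402 is installed.
    import keras_hub

    # The vision tower is now reachable next to the other Gemma3 classes.
    print(keras_hub.models.Gemma3VisionEncoder)
    print(keras_hub.models.Gemma3Backbone)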
keras_hub/api/tokenizers/__init__.py
CHANGED

@@ -30,10 +30,6 @@ from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import (
     PaliGemmaTokenizer,
 )
 from keras_hub.src.models.phi3.phi3_tokenizer import Phi3Tokenizer
-from keras_hub.src.models.qwen.qwen_tokenizer import QwenTokenizer
-from keras_hub.src.models.qwen.qwen_tokenizer import (
-    QwenTokenizer as Qwen2Tokenizer,
-)
 from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer
 from keras_hub.src.models.roformer_v2.roformer_v2_tokenizer import (
     RoformerV2Tokenizer,
keras_hub/src/layers/preprocessing/image_converter.py
CHANGED

@@ -16,6 +16,7 @@ from keras_hub.src.utils.preset_utils import get_preset_loader
 from keras_hub.src.utils.preset_utils import get_preset_saver
 from keras_hub.src.utils.python_utils import classproperty
 from keras_hub.src.utils.tensor_utils import check_bounding_box_support
+from keras_hub.src.utils.tensor_utils import in_tf_function
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 
@@ -270,9 +271,15 @@ class ImageConverter(PreprocessingLayer):
         else:
             x = inputs
         if self.scale is not None:
-            …
+            # If we are scaling always cast to the compute dtype. We can't
+            # leave things as an int type if we are scaling to [0, 1].
+            scale = self._expand_non_channel_dims(self.scale, x)
+            x, scale = self._convert_types(x, scale, self.compute_dtype)
+            x = x * scale
         if self.offset is not None:
-            …
+            offset = self._expand_non_channel_dims(self.offset, x)
+            x, offset = self._convert_types(x, offset, x.dtype)
+            x = x + offset
         if isinstance(inputs, dict):
             inputs["images"] = x
         else:
@@ -280,26 +287,29 @@ class ImageConverter(PreprocessingLayer):
         return inputs
 
     def _expand_non_channel_dims(self, value, inputs):
-        … (2 lines)
+        """Expand non channel dims so value is broadcastable with inputs."""
         unbatched = len(ops.shape(inputs)) == 3
         channels_first = self.data_format == "channels_first"
         if unbatched:
             broadcast_dims = (1, 2) if channels_first else (0, 1)
         else:
             broadcast_dims = (0, 2, 3) if channels_first else (0, 1, 2)
-        # …
-        … (11 lines)
+        # An numpy value will work backend native ops or with tf.data.
+        return np.expand_dims(value, broadcast_dims)
+
+    def _convert_types(self, x, y, dtype):
+        """Make sure x and y have the same dtype and are on ths same device."""
+        if in_tf_function():
+            # This could happen on any backend if we are running in tf.data.
+            import tensorflow as tf
+
+            return tf.cast(x, dtype), tf.cast(y, dtype)
+        x = ops.cast(x, dtype)
+        y = ops.cast(y, dtype)
+        if keras.backend.backend() == "torch":
+            # Place on the same device as x (the image).
+            y = y.to(x.device)
+        return x, y
 
     def get_config(self):
         config = super().get_config()
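The new rescaling path expands per-channel `scale`/`offset` values over every non-channel dimension and casts both operands to a common dtype before multiplying, so integer image inputs are not truncated when scaling to [0, 1]. Below is a minimal standalone NumPy sketch of that broadcasting rule; it is not the library code, and the names are illustrative only.

    import numpy as np

    # Sketch of the broadcasting rule used by the new scale/offset path:
    # per-channel values are expanded over every non-channel dim so they
    # broadcast against NHWC or NCHW images.
    def expand_non_channel_dims(value, images, data_format="channels_last"):
        unbatched = images.ndim == 3
        channels_first = data_format == "channels_first"
        if unbatched:
            broadcast_dims = (1, 2) if channels_first else (0, 1)
        else:
            broadcast_dims = (0, 2, 3) if channels_first else (0, 1, 2)
        return np.expand_dims(value, broadcast_dims)

    images = np.random.randint(0, 256, size=(2, 8, 8, 3), dtype="uint8")
    scale = expand_non_channel_dims(np.float32(1.0 / 255.0), images)
    offset = expand_non_channel_dims(np.array([-0.5, -0.5, -0.5], "float32"), images)

    # Cast to a float compute dtype before scaling, as the diff's comments note,
    # so uint8 inputs end up in [0, 1] rather than being truncated to ints.
    x = images.astype("float32") * scale + offset
    print(x.shape, x.dtype)  # (2, 8, 8, 3) float32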
keras_hub/src/models/gemma3/gemma3_attention.py
CHANGED

@@ -8,19 +8,28 @@ from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
 from keras_hub.src.models.gemma.rms_normalization import RMSNormalization
 from keras_hub.src.utils.keras_utils import clone_initializer
 from keras_hub.src.utils.keras_utils import fused_attention_op_available
+from keras_hub.src.utils.keras_utils import gpu_supports_fused_attention_op
+from keras_hub.src.utils.keras_utils import running_on_gpu
 from keras_hub.src.utils.keras_utils import running_on_tpu
 
 
 class CachedGemma3Attention(keras.layers.Layer):
     """A cached grouped query attention layer for Gemma3.
 
-    This is …
+    This is the same as the attention layer used for Gemma and Gemma2. It
+    exposes a few additional args:
 
-    … (5 lines)
+    `use_query_key_norm`: bool. If True, apply RMS normalization on query
+        and key. For Gemma3, this is True.
+    `rope_wavelength`: float. Configurable value for RoPE wavelength. Gemma3
+        uses 10K for local attention layers and 1M for global attention layers.
+    `gate_dim_reduction`: int. In the gating layers, the output dimension is
+        `intermediate_dim // gate_dim_reduction`. For Gemma and Gemma2, this
+        value is 2. For Gemma3, it is 1.
+
+    Moreover, the call() method takes in a `cache_update_mask` so as to make
+    sure that the key-value cache is updated only for the non-prompt tokens
+    during generation.
     """
 
     def __init__(
@@ -139,17 +148,22 @@ class CachedGemma3Attention(keras.layers.Layer):
         x = self.rope_layer(x, start_index=start_index)
         return x
 
-    def …
+    def _use_fused_attention_op(self):
         if not fused_attention_op_available():
             return False
         if self.dropout > 0.0:
             return False
-        if …
-        … (5 lines)
+        if running_on_gpu():
+            # GPU never supports softcap in the fused op.
+            if self.logit_soft_cap is not None:
+                return False
+            return gpu_supports_fused_attention_op()
+        elif running_on_tpu():
+            # TPU supports softcap with on keras >= 3.10.
+            sig = inspect.signature(ops.dot_product_attention)
+            return "attn_logits_soft_cap" in sig.parameters
+        else:
+            return False
 
     def _compute_attention(
         self,
@@ -166,7 +180,14 @@ class CachedGemma3Attention(keras.layers.Layer):
             query_normalization = 1 / np.sqrt(
                 self.hidden_dim // self.num_query_heads
             )
-        …
+
+        if self.use_sliding_window_attention and attention_mask is not None:
+            attention_mask = self._mask_sliding_window(
+                attention_mask,
+                cache_update_index=cache_update_index,
+            )
+
+        if self._use_fused_attention_op():
             if attention_mask is not None:
                 attention_mask = ops.expand_dims(attention_mask, axis=1)
                 attention_mask = ops.cast(attention_mask, dtype="bool")
@@ -205,13 +226,8 @@ class CachedGemma3Attention(keras.layers.Layer):
                 ops.tanh(attention_logits), self.logit_soft_cap
             )
 
-        if …
-            attention_mask = …
-                attention_mask,
-                cache_update_index=cache_update_index,
-            )
-
-        attention_mask = attention_mask[:, None, None, :, :]
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, None, None, :, :]
         orig_dtype = attention_logits.dtype
         attention_softmax = self.softmax(attention_logits, mask=attention_mask)
         attention_softmax = ops.cast(attention_softmax, orig_dtype)
@@ -256,6 +272,7 @@ class CachedGemma3Attention(keras.layers.Layer):
         attention_mask=None,
         cache=None,
         cache_update_index=0,
+        cache_update_mask=None,
         training=False,
     ):
         query = self.query_dense(x)
@@ -275,7 +292,43 @@ class CachedGemma3Attention(keras.layers.Layer):
 
             key_update = self._apply_rope(key_update, cache_update_index)
             value_update = self.value_dense(x)
+
+            # Update cache. Note that the cache is updated only if the
+            # corresponding `cache_update_mask` value is True. This is to
+            # ensure that we don't update the cache at indices corresponding to
+            # the prompt. For Gemma3, in particular, this is useful because
+            # image tokens have bidirectional attention. During generation,
+            # if we have uneven inputs during generation, we might end up having
+            # causal attention between image tokens, which is incorrect. To
+            # avoid this, bidirectional attention is taken care of during
+            # the prefill step, and during generation, the cache is not updated
+            # for the prompt. The shape of `cache_update_mask` is
+            # `(bsz, seq_len)`, where `seq_len` is 1 when we are generating
+            # token-by-token.
             start = [0, cache_update_index, 0, 0]
+            if cache_update_mask is not None:
+                cache_update_mask = ops.expand_dims(
+                    ops.expand_dims(cache_update_mask, axis=-1),
+                    axis=-1,
+                )
+                key_original = ops.slice(
+                    key_cache, start, ops.shape(key_update)
+                )
+                value_original = ops.slice(
+                    value_cache, start, ops.shape(value_update)
+                )
+
+                key_update = ops.where(
+                    cache_update_mask,
+                    key_update,
+                    key_original,
+                )
+                value_update = ops.where(
+                    cache_update_mask,
+                    value_update,
+                    value_original,
+                )
+
             key = ops.slice_update(key_cache, start, key_update)
             value = ops.slice_update(value_cache, start, value_update)
             cache = ops.stack((key, value), axis=1)
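The comment block above explains that cached key/value entries are only overwritten where `cache_update_mask` is True, so prompt positions written during prefill stay untouched while generating. Below is a minimal standalone sketch of that masked update with `keras.ops`; it is not the library code, and the tiny shapes are illustrative, following the `(batch, cache_len, num_heads, head_dim)` cache layout implied by the diff.

    import numpy as np
    from keras import ops

    batch, heads, seq, head_dim, cache_len = 1, 2, 1, 4, 8
    cache_update_index = 3

    key_cache = ops.zeros((batch, cache_len, heads, head_dim))
    key_update = ops.ones((batch, seq, heads, head_dim))
    # One boolean per position in the update; False means "keep the cached value".
    cache_update_mask = ops.convert_to_tensor(np.array([[False]]))

    start = [0, cache_update_index, 0, 0]
    mask = ops.expand_dims(ops.expand_dims(cache_update_mask, axis=-1), axis=-1)
    # Read back what is currently in the cache at the update position.
    key_original = ops.slice(key_cache, start, ops.shape(key_update))
    # Keep the new keys only where the mask allows an update.
    key_update = ops.where(mask, key_update, key_original)
    key_cache = ops.slice_update(key_cache, start, key_update)

    # Because the mask is False, position 3 of the cache keeps its original zeros.
    print(ops.convert_to_numpy(key_cache)[0, cache_update_index, 0])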
keras_hub/src/models/gemma3/gemma3_backbone.py
CHANGED

@@ -19,13 +19,10 @@ class Gemma3Backbone(Backbone):
 
     This backbone implements the Gemma3 model architecture. Gemma3 is a
     vision-language model (image-text in, text out). The text input is encoded
-    using an embedding layer; images are encoded using a vision transformer …
-    After encoding these two modalities, the image embeddings are placed …
-    correct position in the text embedding sequence. The mixed sequence …
-    embeddings is then passed through transformer decoder layers.
-
-    Currently, this model supports only the `vision_encoder = None` case, i.e.,
-    working only with text.
+    using an embedding layer; images are encoded using a vision transformer
+    (ViT). After encoding these two modalities, the image embeddings are placed
+    in the correct position in the text embedding sequence. The mixed sequence
+    of embeddings is then passed through transformer decoder layers.
 
     For a higher-level object for text-generation, see
     `keras_hub.models.Gemma3CausalLM`.
@@ -66,8 +63,9 @@ class Gemma3Backbone(Backbone):
           window attention. Defaults to `False`.
         sliding_window_size: int. Size of the sliding local window. Defaults to
             `4096`.
-        vision_encoder: …
-            takes in images and returns corresponding sequence of embeddings.
+        vision_encoder: A `Gemma3VisionEncoder` instance. `call()`
+            takes in images and returns corresponding sequence of embeddings. If
+            `None`, the model is a text-only model.
         layer_norm_epsilon: float. The epsilon value user for every layer norm
             in all transformer blocks. Defaults to `1e-6`.
         dropout: float. Dropout probability for the Transformer decoder blocks.
@@ -75,10 +73,12 @@ class Gemma3Backbone(Backbone):
         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
             for the models computations and weights. Note that some
             computations, such as softmax and layer normalization will always
-            be done …
+            be done in float32 precision regardless of dtype. Defaults to
+            `bfloat16`.
 
     Example:
     ```python
+    # === Language Gemma3 model ===
     input_data = {}
     input_data["token_ids"] = np.ones(shape=(1, 300), dtype="int32")
     input_data["padding_mask"] = (
@@ -86,32 +86,90 @@ class Gemma3Backbone(Backbone):
         .astype(bool)
     )
 
+    # Pretrained Gemma3 decoder.
+    model = keras_hub.models.Gemma3Backbone.from_preset(
+        "gemma3_instruct_4b_text"
+    )
+    model(input_data)
+
+    # Randomly initialized Gemma3 decoder with a custom config.
+    model = keras_hub.models.Gemma3Backbone(
+        vocabulary_size=262144,
+        image_size=896,
+        num_layers=34,
+        num_query_heads=8,
+        num_key_value_heads=4,
+        hidden_dim=2560,
+        intermediate_dim=10240,
+        head_dim=256,
+        query_head_dim_normalize=True,
+        use_post_ffw_norm=True,
+        use_post_attention_norm=True,
+        final_logit_soft_cap=None,
+        attention_logit_soft_cap=None,
+        sliding_window_size=1024,
+        use_sliding_window_attention=True,
+        vision_encoder=None,
+        layer_norm_epsilon=1e-06,
+        dtype="bfloat16",
+    )
+    model(input_data)
+
+    # === Vision + Language Gemma3 model ===
+    input_data = {}
+    input_data["images"] = np.ones(shape=(1, 1, 896, 896, 3))
+    input_data["token_ids"] = np.ones(shape=(1, 300), dtype="int32")
+    # images after the text part of the sequence.
+    input_data["vision_mask"] = np.expand_dims(
+        np.array([0] * 30 + [1] * 256 + [0] * 14),
+        axis=0,
+    ).astype(bool)
+    input_data["vision_indices"] = (
+        np.expand_dims(np.arange(30, 286), axis=0)
+    )
+    input_data["padding_mask"] = (
+        np.expand_dims(np.array([1] * 286 + [0] * (300 - 286)), axis=0)
+        .astype(bool)
+    )
+
     # Pretrained Gemma3 decoder.
     model = keras_hub.models.Gemma3Backbone.from_preset("gemma3_instruct_4b")
     model(input_data)
 
-    … (22 lines)
+    # Randomly initialized Gemma3 decoder with a custom config.
+    vision_encoder = Gemma3VisionEncoder(
+        image_size=896,
+        patch_size=14,
+        num_heads=16,
+        hidden_dim=1152,
+        num_layers=27,
+        intermediate_dim=4304,
+        output_dim=2560,
+        pool_size=4,
+        layer_norm_epsilon=1e-6,
+        dtype="float32",
+    )
+
+    model = keras_hub.models.Gemma3Backbone(
+        vocabulary_size=262144,
+        image_size=896,
+        num_layers=34,
+        num_query_heads=8,
+        num_key_value_heads=4,
+        hidden_dim=2560,
+        intermediate_dim=10240,
+        head_dim=256,
+        query_head_dim_normalize=True,
+        use_post_ffw_norm=True,
+        use_post_attention_norm=True,
+        final_logit_soft_cap=None,
+        attention_logit_soft_cap=None,
+        sliding_window_size=1024,
+        use_sliding_window_attention=True,
+        vision_encoder=vision_encoder,
+        layer_norm_epsilon=1e-06,
+        dtype="bfloat16"
+    )
     model(input_data)
     ```
     """
@@ -134,18 +192,14 @@ class Gemma3Backbone(Backbone):
         final_logit_soft_cap=None,
         use_sliding_window_attention=False,
         sliding_window_size=1024,
+        local_rope_scaling_factor=1.0,
+        global_rope_scaling_factor=1.0,
         vision_encoder=None,
         layer_norm_epsilon=1e-6,
         dropout=0,
         dtype=None,
         **kwargs,
     ):
-        if vision_encoder is not None:
-            raise ValueError(
-                "Currently, only the text version of the Gemma3 model is "
-                "supported."
-            )
-
         # === Layers ===
         self.token_embedding = ReversibleEmbedding(
             input_dim=vocabulary_size,
@@ -176,7 +230,11 @@ class Gemma3Backbone(Backbone):
             # 5 local, 1 global
             sliding_window = use_sliding_window_attention and (i % 6 < 5)
             rope_wavelength = 10_000.0 if sliding_window else 1_000_000.0
-            rope_scaling_factor = …
+            rope_scaling_factor = (
+                local_rope_scaling_factor
+                if sliding_window
+                else global_rope_scaling_factor
+            )
             layer = Gemma3DecoderBlock(
                 hidden_dim=hidden_dim,
                 intermediate_dim=intermediate_dim,
@@ -215,10 +273,11 @@ class Gemma3Backbone(Backbone):
             vision_indices_input = keras.Input(
                 shape=(None,), dtype="int32", name="vision_indices"
            )
-            # …
-            # `vision_indices_input`…
-            … (2 lines)
+            # Truth be told, this is redundant, and we can infer this from
+            # `vision_indices_input`. But it is easier to return this from
+            # the preprocessor than to compute it here.
+            vision_mask_input = keras.Input(
+                shape=(None,), dtype="int32", name="vision_mask"
             )
 
         token_id_input = keras.Input(
@@ -239,7 +298,7 @@ class Gemma3Backbone(Backbone):
         if not text_only_model:
             img_embeddings = self.vision_encoder(image_input)
 
-            …
+            # == Interleaving text and images ==
             # Place image embeddings in the right position in
             # `text_embeddings`.
             x = self.interleave_embeddings(
@@ -255,7 +314,7 @@ class Gemma3Backbone(Backbone):
             x = transformer_layer(
                 x,
                 padding_mask=padding_mask_input,
-                …
+                vision_mask=None if text_only_model else vision_mask_input,
             )
         sequence_output = self.layer_norm(x)
 
@@ -268,7 +327,7 @@ class Gemma3Backbone(Backbone):
                 {
                     "images": image_input,
                     "vision_indices": vision_indices_input,
-                    "…
+                    "vision_mask": vision_mask_input,
                 }
             )
 
@@ -296,6 +355,8 @@ class Gemma3Backbone(Backbone):
         self.final_logit_soft_cap = final_logit_soft_cap
         self.use_sliding_window_attention = use_sliding_window_attention
         self.sliding_window_size = sliding_window_size
+        self.local_rope_scaling_factor = local_rope_scaling_factor
+        self.global_rope_scaling_factor = global_rope_scaling_factor
         self.layer_norm_epsilon = layer_norm_epsilon
         self.dropout = dropout
 
@@ -330,6 +391,8 @@ class Gemma3Backbone(Backbone):
                     self.use_sliding_window_attention
                 ),
                 "sliding_window_size": self.sliding_window_size,
+                "local_rope_scaling_factor": self.local_rope_scaling_factor,
+                "global_rope_scaling_factor": self.global_rope_scaling_factor,
                 "vision_encoder": None
                 if self.vision_encoder is None
                 else keras.layers.serialize(self.vision_encoder),
@@ -339,6 +402,14 @@ class Gemma3Backbone(Backbone):
         )
         return config
 
+    def get_lora_target_names(self):
+        target_names = super().get_lora_target_names()
+
+        # Add these for `Gemma3VITAttention`.
+        if not self.text_only_model:
+            target_names += ["query_proj", "value_proj"]
+        return target_names
+
     @classmethod
     def from_config(cls, config):
         config.update(
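The constructor now threads separate RoPE scaling factors through the 5-local / 1-global layer pattern shown above: sliding-window layers use the 10K wavelength and `local_rope_scaling_factor`, while every sixth layer is global and uses the 1M wavelength and `global_rope_scaling_factor`. A small standalone sketch of that per-layer selection follows; the non-default scaling value is illustrative, not a preset default.

    # Sketch of the per-layer rotary settings computed in the constructor.
    num_layers = 12
    use_sliding_window_attention = True
    local_rope_scaling_factor = 1.0
    global_rope_scaling_factor = 8.0  # illustrative value, not a preset default

    for i in range(num_layers):
        sliding_window = use_sliding_window_attention and (i % 6 < 5)
        rope_wavelength = 10_000.0 if sliding_window else 1_000_000.0
        rope_scaling_factor = (
            local_rope_scaling_factor
            if sliding_window
            else global_rope_scaling_factor
        )
        kind = "local" if sliding_window else "global"
        print(i, kind, rope_wavelength, rope_scaling_factor)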