keras-hub 0.25.0.dev0__py3-none-any.whl → 0.26.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. keras_hub/layers/__init__.py +21 -0
  2. keras_hub/models/__init__.py +27 -0
  3. keras_hub/src/layers/modeling/non_max_supression.py +5 -2
  4. keras_hub/src/layers/modeling/reversible_embedding.py +2 -275
  5. keras_hub/src/layers/modeling/token_and_position_embedding.py +6 -6
  6. keras_hub/src/layers/modeling/transformer_layer_utils.py +9 -9
  7. keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +3 -1
  8. keras_hub/src/layers/preprocessing/multi_segment_packer.py +3 -1
  9. keras_hub/src/models/albert/albert_backbone.py +1 -3
  10. keras_hub/src/models/backbone.py +3 -0
  11. keras_hub/src/models/bart/bart_backbone.py +1 -3
  12. keras_hub/src/models/bert/bert_backbone.py +2 -4
  13. keras_hub/src/models/bloom/bloom_backbone.py +1 -3
  14. keras_hub/src/models/causal_lm.py +2 -2
  15. keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +1 -3
  16. keras_hub/src/models/edrec/edrec_backbone.py +147 -0
  17. keras_hub/src/models/edrec/edrec_layers.py +434 -0
  18. keras_hub/src/models/edrec/edrec_seq2seq_lm.py +273 -0
  19. keras_hub/src/models/electra/electra_backbone.py +1 -3
  20. keras_hub/src/models/f_net/f_net_backbone.py +1 -3
  21. keras_hub/src/models/falcon/falcon_backbone.py +1 -3
  22. keras_hub/src/models/flux/flux_layers.py +3 -3
  23. keras_hub/src/models/flux/flux_maths.py +29 -15
  24. keras_hub/src/models/gemma/gemma_backbone.py +1 -3
  25. keras_hub/src/models/gemma/gemma_causal_lm.py +1 -1
  26. keras_hub/src/models/gemma3/gemma3_attention.py +1 -1
  27. keras_hub/src/models/gemma3/gemma3_backbone.py +70 -8
  28. keras_hub/src/models/gemma3/gemma3_causal_lm.py +16 -1
  29. keras_hub/src/models/gemma3/gemma3_decoder_block.py +23 -3
  30. keras_hub/src/models/gemma3/{gemma3_interleave_embeddings.py → gemma3_layers.py} +101 -0
  31. keras_hub/src/models/gemma3/gemma3_presets.py +79 -7
  32. keras_hub/src/models/gemma3/gemma3_vision_encoder.py +1 -1
  33. keras_hub/src/models/gpt2/gpt2_backbone.py +1 -3
  34. keras_hub/src/models/gpt2/gpt2_causal_lm.py +1 -1
  35. keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +1 -3
  36. keras_hub/src/models/gpt_oss/gpt_oss_backbone.py +1 -3
  37. keras_hub/src/models/llama/llama_backbone.py +1 -3
  38. keras_hub/src/models/masked_lm.py +1 -1
  39. keras_hub/src/models/mistral/mistral_backbone.py +1 -3
  40. keras_hub/src/models/mixtral/mixtral_backbone.py +1 -3
  41. keras_hub/src/models/moonshine/moonshine_backbone.py +1 -3
  42. keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +1 -3
  43. keras_hub/src/models/parseq/parseq_tokenizer.py +3 -1
  44. keras_hub/src/models/phi3/phi3_backbone.py +1 -3
  45. keras_hub/src/models/qwen/qwen_backbone.py +1 -3
  46. keras_hub/src/models/qwen/qwen_presets.py +209 -0
  47. keras_hub/src/models/qwen3/qwen3_backbone.py +1 -3
  48. keras_hub/src/models/qwen3_moe/qwen3_moe_backbone.py +1 -3
  49. keras_hub/src/models/qwen3_moe/qwen3_moe_presets.py +15 -0
  50. keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +1 -3
  51. keras_hub/src/models/roformer_v2/roformer_v2_backbone.py +1 -3
  52. keras_hub/src/models/rqvae/__init__.py +5 -0
  53. keras_hub/src/models/rqvae/rqvae_backbone.py +167 -0
  54. keras_hub/src/models/rqvae/rqvae_layers.py +335 -0
  55. keras_hub/src/models/rwkv7/__init__.py +5 -0
  56. keras_hub/src/models/rwkv7/rwkv7_backbone.py +180 -0
  57. keras_hub/src/models/rwkv7/rwkv7_causal_lm.py +259 -0
  58. keras_hub/src/models/rwkv7/rwkv7_causal_lm_preprocessor.py +214 -0
  59. keras_hub/src/models/rwkv7/rwkv7_layer.py +724 -0
  60. keras_hub/src/models/rwkv7/rwkv7_presets.py +26 -0
  61. keras_hub/src/models/rwkv7/rwkv7_tokenizer.py +495 -0
  62. keras_hub/src/models/sam/sam_backbone.py +5 -1
  63. keras_hub/src/models/sam/sam_prompt_encoder.py +1 -1
  64. keras_hub/src/models/sam3/__init__.py +7 -0
  65. keras_hub/src/models/sam3/roi_align.py +222 -0
  66. keras_hub/src/models/sam3/sam3_detr_decoder.py +641 -0
  67. keras_hub/src/models/sam3/sam3_detr_encoder.py +293 -0
  68. keras_hub/src/models/sam3/sam3_dot_product_scoring.py +120 -0
  69. keras_hub/src/models/sam3/sam3_geometry_encoder.py +517 -0
  70. keras_hub/src/models/sam3/sam3_image_converter.py +10 -0
  71. keras_hub/src/models/sam3/sam3_layers.py +814 -0
  72. keras_hub/src/models/sam3/sam3_mask_decoder.py +374 -0
  73. keras_hub/src/models/sam3/sam3_pc_backbone.py +306 -0
  74. keras_hub/src/models/sam3/sam3_pc_image_segmenter.py +282 -0
  75. keras_hub/src/models/sam3/sam3_pc_image_segmenter_preprocessor.py +336 -0
  76. keras_hub/src/models/sam3/sam3_presets.py +16 -0
  77. keras_hub/src/models/sam3/sam3_text_encoder.py +212 -0
  78. keras_hub/src/models/sam3/sam3_tokenizer.py +65 -0
  79. keras_hub/src/models/sam3/sam3_utils.py +134 -0
  80. keras_hub/src/models/sam3/sam3_vision_encoder.py +738 -0
  81. keras_hub/src/models/segformer/segformer_backbone.py +6 -6
  82. keras_hub/src/models/siglip/siglip_layers.py +1 -3
  83. keras_hub/src/models/smollm3/smollm3_backbone.py +1 -3
  84. keras_hub/src/models/stable_diffusion_3/t5_encoder.py +1 -3
  85. keras_hub/src/models/t5/t5_backbone.py +1 -3
  86. keras_hub/src/models/t5gemma/t5gemma_backbone.py +1 -3
  87. keras_hub/src/models/task.py +1 -1
  88. keras_hub/src/tests/test_case.py +394 -3
  89. keras_hub/src/tokenizers/byte_pair_tokenizer.py +33 -2
  90. keras_hub/src/tokenizers/byte_tokenizer.py +3 -1
  91. keras_hub/src/tokenizers/sentence_piece_tokenizer.py +15 -1
  92. keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +3 -1
  93. keras_hub/src/tokenizers/word_piece_tokenizer.py +15 -1
  94. keras_hub/src/utils/preset_utils.py +1 -1
  95. keras_hub/src/utils/tensor_utils.py +12 -0
  96. keras_hub/src/utils/transformers/convert_gemma3.py +68 -22
  97. keras_hub/src/utils/transformers/convert_qwen3_moe.py +4 -1
  98. keras_hub/src/utils/transformers/convert_sam3.py +472 -0
  99. keras_hub/src/utils/transformers/export/gemma3.py +196 -0
  100. keras_hub/src/utils/transformers/export/hf_exporter.py +86 -25
  101. keras_hub/src/utils/transformers/export/qwen.py +136 -0
  102. keras_hub/src/utils/transformers/preset_loader.py +15 -1
  103. keras_hub/src/version.py +1 -1
  104. keras_hub/tokenizers/__init__.py +6 -0
  105. {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/METADATA +6 -13
  106. {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/RECORD +108 -76
  107. {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/WHEEL +1 -1
  108. keras_hub/src/models/gemma3/rms_normalization.py +0 -26
  109. {keras_hub-0.25.0.dev0.dist-info → keras_hub-0.26.0.dev0.dist-info}/top_level.txt +0 -0
keras_hub/src/models/edrec/edrec_seq2seq_lm.py
@@ -0,0 +1,273 @@
+import keras
+from keras import ops
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.edrec.edrec_backbone import EdRecBackbone
+from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
+from keras_hub.src.utils.tensor_utils import any_equal
+
+
+@keras_hub_export("keras_hub.models.EdRecSeq2SeqLM")
+class EdRecSeq2SeqLM(Seq2SeqLM):
+    """EdRec Seq2SeqLM.
+
+    Args:
+        backbone: A `keras_hub.models.EdRecBackbone` instance.
+        preprocessor: Optional preprocessor.
+    """
+
+    backbone_cls = EdRecBackbone
+    preprocessor_cls = None
+
+    def __init__(
+        self,
+        backbone,
+        preprocessor=None,
+        **kwargs,
+    ):
+        # === Layers ===
+        self.backbone = backbone
+        self.preprocessor = preprocessor
+
+        # LM Head
+        self.lm_head = keras.layers.Dense(
+            backbone.vocab_size, use_bias=False, name="lm_head"
+        )
+
+        # === Functional Model ===
+        encoder_token_ids = keras.Input(
+            shape=(None,), dtype="int32", name="encoder_token_ids"
+        )
+        decoder_token_ids = keras.Input(
+            shape=(None,), dtype="int32", name="decoder_token_ids"
+        )
+        encoder_padding_mask = keras.Input(
+            shape=(None,), dtype="bool", name="encoder_padding_mask"
+        )
+        decoder_padding_mask = keras.Input(
+            shape=(None,), dtype="bool", name="decoder_padding_mask"
+        )
+
+        inputs = {
+            "encoder_token_ids": encoder_token_ids,
+            "decoder_token_ids": decoder_token_ids,
+            "encoder_padding_mask": encoder_padding_mask,
+            "decoder_padding_mask": decoder_padding_mask,
+        }
+
+        backbone_outputs = backbone(inputs)
+        # The backbone returns a dict; we likely want the decoder output for the
+        # LM head if both are present, or just use what makes sense.
+        # For a Seq2Seq model training, we usually consume the decoder output.
+        outputs = self.lm_head(backbone_outputs["decoder_sequence_output"])
+
+        super().__init__(
+            inputs=inputs,
+            outputs=outputs,
+            **kwargs,
+        )
+
+    def call_decoder_with_cache(
+        self,
+        encoder_hidden_states,
+        encoder_padding_mask,
+        decoder_token_ids,
+        decoder_padding_mask=None,
+        self_attention_cache=None,
+        self_attention_cache_update_index=None,
+        cross_attention_cache=None,
+        cross_attention_cache_update_index=None,
+    ):
+        x = self.backbone.embedding(decoder_token_ids)
+        if decoder_padding_mask is None:
+            decoder_padding_mask = ops.not_equal(decoder_token_ids, 0)
+
+        self_attention_caches = []
+        cross_attention_caches = []
+
+        for i, layer in enumerate(self.backbone.decoder_layers):
+            current_self_cache = (
+                self_attention_cache[:, i, ...]
+                if self_attention_cache is not None
+                else None
+            )
+            current_cross_cache = (
+                cross_attention_cache[:, i, ...]
+                if cross_attention_cache is not None
+                else None
+            )
+
+            x, next_self, next_cross = layer(
+                x,
+                encoder_outputs=encoder_hidden_states,
+                decoder_padding_mask=decoder_padding_mask,
+                encoder_padding_mask=encoder_padding_mask,
+                self_attention_cache=current_self_cache,
+                self_attention_cache_update_index=self_attention_cache_update_index,
+                cross_attention_cache=current_cross_cache,
+                cross_attention_cache_update_index=cross_attention_cache_update_index,
+            )
+
+            if next_self is not None:
+                self_attention_caches.append(next_self)
+            if next_cross is not None:
+                cross_attention_caches.append(next_cross)
+
+        if self_attention_cache_update_index is not None:
+            self_attention_cache = ops.stack(self_attention_caches, axis=1)
+        if cross_attention_cache_update_index is not None:
+            cross_attention_cache = ops.stack(cross_attention_caches, axis=1)
+
+        hidden_states = x
+        logits = self.lm_head(x)
+        return (
+            logits,
+            hidden_states,
+            self_attention_cache,
+            cross_attention_cache,
+        )
+
+    def call_encoder(self, token_ids, padding_mask):
+        x = self.backbone.embedding(token_ids)
+        for layer in self.backbone.encoder_layers:
+            x = layer(x, padding_mask=padding_mask)
+        return x
+
+    def _initialize_cache(self, encoder_token_ids, decoder_token_ids):
+        batch_size = ops.shape(encoder_token_ids)[0]
+        encoder_max_length = ops.shape(encoder_token_ids)[1]
+        decoder_max_length = ops.shape(decoder_token_ids)[1]
+
+        num_layers = self.backbone.num_layers_dec
+        num_heads = self.backbone.num_heads
+        head_dim = self.backbone.hidden_dim // num_heads
+
+        shape = [
+            batch_size,
+            num_layers,
+            2,
+            decoder_max_length,
+            num_heads,
+            head_dim,
+        ]
+        self_attention_cache = ops.zeros(shape, dtype=self.compute_dtype)
+
+        shape[3] = encoder_max_length
+        cross_attention_cache = ops.zeros(shape, dtype=self.compute_dtype)
+
+        return self_attention_cache, cross_attention_cache
+
+    def generate_step(self, inputs, stop_token_ids=None):
+        encoder_token_ids = inputs["encoder_token_ids"]
+        encoder_padding_mask = inputs["encoder_padding_mask"]
+        decoder_token_ids = inputs.get("decoder_token_ids")
+        if decoder_token_ids is None:
+            batch_size = ops.shape(encoder_token_ids)[0]
+            decoder_token_ids = ops.zeros((batch_size, 1), dtype="int32")
+
+        decoder_padding_mask = inputs.get("decoder_padding_mask")
+        if decoder_padding_mask is None:
+            decoder_padding_mask = ops.ones_like(
+                decoder_token_ids, dtype="bool"
+            )
+
+        batch_size = ops.shape(encoder_token_ids)[0]
+
+        encoder_hidden_states = self.call_encoder(
+            encoder_token_ids, encoder_padding_mask
+        )
+        self_attention_cache, cross_attention_cache = self._initialize_cache(
+            encoder_token_ids, decoder_token_ids
+        )
+
+        row_lengths = ops.sum(ops.cast(decoder_padding_mask, "int32"), axis=-1)
+        start_index = ops.min(row_lengths)
+
+        # Init cache logic for step 0
+        token_0 = ops.slice(decoder_token_ids, [0, 0], [batch_size, 1])
+        mask_0 = ops.slice(decoder_padding_mask, [0, 0], [batch_size, 1])
+        _, _, s_cache, c_cache = self.call_decoder_with_cache(
+            encoder_hidden_states,
+            encoder_padding_mask,
+            token_0,
+            mask_0,
+            self_attention_cache,
+            0,
+            cross_attention_cache,
+            0,
+        )
+
+        # We define cache as tuple
+        cache = (s_cache, c_cache)
+        hidden_states = ops.zeros_like(token_0, dtype="float32")
+
+        def next(prompt, cache, index):
+            s_c, c_c = cache
+
+            # Handle beam search replication if needed
+            curr_batch = ops.shape(prompt)[0]
+            enc_batch = ops.shape(encoder_hidden_states)[0]
+
+            enc_states = encoder_hidden_states
+            enc_mask = encoder_padding_mask
+
+            if curr_batch != enc_batch:
+                repeats = curr_batch // enc_batch
+                enc_states = ops.repeat(enc_states, repeats, axis=0)
+                enc_mask = ops.repeat(enc_mask, repeats, axis=0)
+
+            cache_index = index - 1
+            num_samples = ops.shape(prompt)[0]
+            prompt_slice = ops.slice(prompt, [0, cache_index], [num_samples, 1])
+
+            logits, h_states, next_s, next_c = self.call_decoder_with_cache(
+                enc_states,
+                enc_mask,
+                prompt_slice,
+                None,
+                s_c,
+                index - 1,
+                c_c,
+                None,  # Cross cache re-use
+            )
+
+            # If the backbone returns the full sequence, we only need the last
+            # token.
+            if ops.shape(logits)[1] != 1:
+                logits = ops.take(logits, [cache_index], axis=1)
+                h_states = ops.take(h_states, [cache_index], axis=1)
+
+            return (
+                ops.squeeze(logits, axis=1),
+                ops.squeeze(h_states, axis=1),
+                (next_s, next_c),
+            )
+
+        new_tokens = self.sampler(
+            next=next,
+            prompt=decoder_token_ids,
+            cache=cache,
+            index=start_index,
+            mask=decoder_padding_mask,
+            stop_token_ids=stop_token_ids,
+            hidden_states=hidden_states,
+            model=self,
+        )
+
+        if stop_token_ids is not None:
+            end_locations = any_equal(
+                new_tokens,
+                stop_token_ids,
+                ops.logical_not(decoder_padding_mask),
+            )
+            end_locations = ops.cast(end_locations, "int32")
+            cumsum = ops.cast(ops.cumsum(end_locations, axis=-1), "int32")
+            overflow = cumsum - end_locations
+            decoder_padding_mask = ops.logical_not(ops.cast(overflow, "bool"))
+        else:
+            decoder_padding_mask = ops.ones_like(new_tokens, dtype="bool")
+
+        return {
+            "decoder_token_ids": new_tokens,
+            "decoder_padding_mask": decoder_padding_mask,
+        }
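The key data structure in the hunk above is the single-tensor cache built by `_initialize_cache`: key/value caches for every decoder layer live in one array, indexed by layer on axis 1. A minimal standalone sketch of that layout (all sizes illustrative, not EdRec preset values):

```python
from keras import ops

# Cache layout from `_initialize_cache` above:
# [batch, num_layers, 2 (key/value), max_length, num_heads, head_dim].
batch, num_layers, max_len, num_heads, head_dim = 2, 4, 8, 4, 16
cache = ops.zeros((batch, num_layers, 2, max_len, num_heads, head_dim))

# Per-layer slice, as done inside `call_decoder_with_cache`:
layer_cache = cache[:, 0, ...]  # -> (batch, 2, max_len, num_heads, head_dim)
key_cache, value_cache = ops.unstack(layer_cache, axis=1)
print(ops.shape(key_cache))  # (2, 8, 4, 16)
```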
keras_hub/src/models/electra/electra_backbone.py
@@ -1,10 +1,8 @@
 import keras
+from keras.layers import ReversibleEmbedding

 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.layers.modeling.transformer_encoder import TransformerEncoder
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.utils.keras_utils import gelu_approximate
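This import swap repeats across most backbones in this release: `ReversibleEmbedding` now ships in Keras core instead of `keras_hub.src.layers.modeling.reversible_embedding`. A quick sketch of the layer's tied-weights behavior (shapes illustrative; assumes a Keras version that exports `keras.layers.ReversibleEmbedding`, as the import above implies):

```python
import keras
from keras.layers import ReversibleEmbedding

# Embed token ids, then reuse the same weight matrix as the output
# projection by calling with reverse=True (a tied LM head).
embedding = ReversibleEmbedding(input_dim=1000, output_dim=64)
token_ids = keras.ops.ones((2, 8), dtype="int32")
hidden = embedding(token_ids)             # -> (2, 8, 64)
logits = embedding(hidden, reverse=True)  # -> (2, 8, 1000)
```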
keras_hub/src/models/f_net/f_net_backbone.py
@@ -1,11 +1,9 @@
 import keras
+from keras.layers import ReversibleEmbedding

 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.modeling.f_net_encoder import FNetEncoder
 from keras_hub.src.layers.modeling.position_embedding import PositionEmbedding
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.utils.keras_utils import gelu_approximate

keras_hub/src/models/falcon/falcon_backbone.py
@@ -1,9 +1,7 @@
 import keras
+from keras.layers import ReversibleEmbedding

 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.falcon.falcon_transformer_decoder import (
     FalconTransformerDecoder,
keras_hub/src/models/flux/flux_layers.py
@@ -38,7 +38,7 @@ class EmbedND(keras.Model):

        Returns:
            KerasTensor: Positional embeddings of shape
-                (..., concatenated_dim, 1, ...).
+                (..., sum(axes_dim) // 2, 2).
        """
        n_axes = ids.shape[-1]
        emb = ops.concatenate(
@@ -46,10 +46,10 @@ class EmbedND(keras.Model):
                self.rope(ids[..., i], dim=self.axes_dim[i], theta=self.theta)
                for i in range(n_axes)
            ],
-            axis=-3,
+            axis=-2,
        )

-        return ops.expand_dims(emb, axis=1)
+        return emb


 class MLPEmbedder(keras.Model):
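The two hunks above change the rotary table layout: the per-axis tables now hold `(cos, sin)` pairs rather than full 2x2 rotation matrices (see the `RotaryPositionalEmbedding` hunk below), and `EmbedND` concatenates them on `axis=-2` with no extra `expand_dims`. A standalone sketch of the resulting shapes (axis sizes illustrative):

```python
from keras import ops

def rope(pos, dim, theta=10000.0):
    # Mirrors the new RotaryPositionalEmbedding: output (..., dim // 2, 2).
    scale = ops.arange(0, dim, 2, dtype="float32") / dim
    omega = 1.0 / (theta**scale)
    out = ops.einsum("...n,d->...nd", pos, omega)
    return ops.stack([ops.cos(out), ops.sin(out)], axis=-1)

ids = ops.ones((1, 6, 2))  # 6 positions, two positional axes
axes_dim = [4, 8]
emb = ops.concatenate(
    [rope(ids[..., i], dim=axes_dim[i]) for i in range(2)], axis=-2
)
print(ops.shape(emb))  # (1, 6, 6, 2), since sum(axes_dim) // 2 == 6
```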
keras_hub/src/models/flux/flux_maths.py
@@ -56,10 +56,7 @@ class RotaryPositionalEmbedding(keras.layers.Layer):
        scale = ops.arange(0, dim, 2, dtype="float32") / dim
        omega = 1.0 / (theta**scale)
        out = ops.einsum("...n,d->...nd", pos, omega)
-        out = ops.stack(
-            [ops.cos(out), -ops.sin(out), ops.sin(out), ops.cos(out)], axis=-1
-        )
-        out = ops.reshape(out, ops.shape(out)[:-1] + (2, 2))
+        out = ops.stack([ops.cos(out), ops.sin(out)], axis=-1)
        return ops.cast(out, dtype="float32")


@@ -71,26 +68,43 @@ class ApplyRoPE(keras.layers.Layer):
        xq: KerasTensor. The query tensor of shape (..., L, D).
        xk: KerasTensor. The key tensor of shape (..., L, D).
        freqs_cis: KerasTensor. The frequency complex numbers tensor with shape
-            `(..., 2)`.
+            (..., L, D//2, 2).

    Returns:
        tuple[KerasTensor, KerasTensor]: The transformed query and key tensors.
    """

    def call(self, xq, xk, freqs_cis):
-        xq_ = ops.reshape(xq, (*ops.shape(xq)[:-1], -1, 1, 2))
-        xk_ = ops.reshape(xk, (*ops.shape(xk)[:-1], -1, 1, 2))
-
-        xq_out = (
-            freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+        # xq, xk shape (..., num_heads, seq_len, D)
+        # freqs_cis shape (..., seq_len, D//2, 2)
+        # Expand freqs_cis to match num_heads dimension
+        freqs_cis = ops.expand_dims(freqs_cis, axis=-4)
+        # Now freqs_cis shape (..., 1, seq_len, D//2, 2)
+
+        xq_ = ops.reshape(xq, (*ops.shape(xq)[:-1], -1, 2))
+        xk_ = ops.reshape(xk, (*ops.shape(xk)[:-1], -1, 2))
+
+        xq_real = xq_[..., 0]
+        xq_imag = xq_[..., 1]
+        xk_real = xk_[..., 0]
+        xk_imag = xk_[..., 1]
+
+        freqs_cos = freqs_cis[..., 0]
+        freqs_sin = freqs_cis[..., 1]
+
+        xq_out_real = xq_real * freqs_cos - xq_imag * freqs_sin
+        xq_out_imag = xq_real * freqs_sin + xq_imag * freqs_cos
+        xk_out_real = xk_real * freqs_cos - xk_imag * freqs_sin
+        xk_out_imag = xk_real * freqs_sin + xk_imag * freqs_cos
+
+        xq_out = ops.reshape(
+            ops.stack([xq_out_real, xq_out_imag], axis=-1), ops.shape(xq)
        )
-        xk_out = (
-            freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+        xk_out = ops.reshape(
+            ops.stack([xk_out_real, xk_out_imag], axis=-1), ops.shape(xk)
        )

-        return ops.reshape(xq_out, ops.shape(xq)), ops.reshape(
-            xk_out, ops.shape(xk)
-        )
+        return xq_out, xk_out


 class FluxRoPEAttention(keras.layers.Layer):
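The rewritten `ApplyRoPE.call` is an explicit complex multiply on `(real, imag)` feature pairs, with the `(cos, sin)` table broadcast across heads. The same arithmetic, written out on concrete shapes (values illustrative):

```python
from keras import ops

seq_len, dim = 4, 8
# Frequency table in the new layout: (seq_len, dim // 2, 2) of (cos, sin).
pos = ops.arange(seq_len, dtype="float32")
scale = ops.arange(0, dim, 2, dtype="float32") / dim
out = ops.einsum("n,d->nd", pos, 1.0 / (10000.0**scale))
freqs_cis = ops.stack([ops.cos(out), ops.sin(out)], axis=-1)

xq = ops.ones((1, 2, seq_len, dim))  # (batch, num_heads, seq_len, D)
xq_ = ops.reshape(xq, (1, 2, seq_len, dim // 2, 2))
cos, sin = freqs_cis[..., 0], freqs_cis[..., 1]  # broadcasts over heads
real = xq_[..., 0] * cos - xq_[..., 1] * sin  # complex multiply, real part
imag = xq_[..., 0] * sin + xq_[..., 1] * cos  # complex multiply, imag part
xq_out = ops.reshape(ops.stack([real, imag], axis=-1), ops.shape(xq))
print(ops.shape(xq_out))  # (1, 2, 4, 8), same shape as the input query
```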
keras_hub/src/models/gemma/gemma_backbone.py
@@ -1,10 +1,8 @@
 import keras
 from keras import ops
+from keras.layers import ReversibleEmbedding

 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
 from keras_hub.src.models.gemma.gemma_decoder_block import GemmaDecoderBlock
 from keras_hub.src.models.gemma.rms_normalization import RMSNormalization
keras_hub/src/models/gemma/gemma_causal_lm.py
@@ -433,7 +433,7 @@ class GemmaCausalLM(CausalLM):
         return per_token_loss

     def get_quantization_layer_structure(self, mode):
-        if mode != "gptq":
+        if mode not in ["gptq", "awq"]:
             return None

         # Wrap embedding + scaling
keras_hub/src/models/gemma3/gemma3_attention.py
@@ -5,7 +5,7 @@ import numpy as np
 from keras import ops

 from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
-from keras_hub.src.models.gemma.rms_normalization import RMSNormalization
+from keras_hub.src.models.gemma3.gemma3_layers import RMSNormalization
 from keras_hub.src.utils.keras_utils import clone_initializer
 from keras_hub.src.utils.keras_utils import fused_attention_op_available
 from keras_hub.src.utils.keras_utils import gpu_supports_fused_attention_op
keras_hub/src/models/gemma3/gemma3_backbone.py
@@ -1,16 +1,14 @@
 import keras
+from keras import layers
 from keras import ops
+from keras.layers import ReversibleEmbedding

 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.modeling.reversible_embedding import (
-    ReversibleEmbedding,
-)
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.gemma.rms_normalization import RMSNormalization
 from keras_hub.src.models.gemma3.gemma3_decoder_block import Gemma3DecoderBlock
-from keras_hub.src.models.gemma3.gemma3_interleave_embeddings import (
-    Gemma3InterleaveEmbeddings,
-)
+from keras_hub.src.models.gemma3.gemma3_layers import Gemma3InterleaveEmbeddings
+from keras_hub.src.models.gemma3.gemma3_layers import Gemma3MeanPooling
+from keras_hub.src.models.gemma3.gemma3_layers import RMSNormalization


 @keras_hub_export("keras_hub.models.Gemma3Backbone")
@@ -27,6 +25,11 @@ class Gemma3Backbone(Backbone):
     For a higher-level object for text-generation, see
     `keras_hub.models.Gemma3CausalLM`.

+    This backbone can also function as an end-to-end embedding model by
+    setting the `is_embedding_model` argument to `True`. When configured as an
+    embedding model with bi-directional attention, it matches the
+    `EmbeddingGemma` architecture.
+
     The default constructor gives a fully customizable, randomly initialized
     Gemma3 model with any vision encoder, number of heads, embedding dimensions,
     and equivalent configuration for the decoder layers. To load preset
@@ -70,6 +73,17 @@ class Gemma3Backbone(Backbone):
            in all transformer blocks. Defaults to `1e-6`.
        dropout: float. Dropout probability for the Transformer decoder blocks.
            Defaults to `0`.
+        is_embedding_model (bool, optional): If `True`, the model will function
+            as an embedding model. This adds mean pooling layer and a two-layer
+            dense projection head to the final sequence output. The model output
+            will be a dictionary containing `'sequence_output'` and
+            `'pooled_output'`. Defaults to `False`.
+        pooling_intermediate_dim (int, optional): The intermediate dimension of
+            the first dense layer in the two-layer pooling projection head.
+            Required if `is_embedding_model` is `True`. Defaults to `None`.
+        embedding_dim (int, optional): The dimension of the final projected
+            embedding. Required if `is_embedding_model` is `True`. Defaults to
+            `None`.
        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
            for the models computations and weights. Note that some
            computations, such as softmax and layer normalization will always
@@ -198,6 +212,9 @@ class Gemma3Backbone(Backbone):
        layer_norm_epsilon=1e-6,
        use_bidirectional_attention=False,
        dropout=0,
+        is_embedding_model=False,
+        pooling_intermediate_dim=None,
+        embedding_dim=None,
        dtype=None,
        **kwargs,
    ):
@@ -319,6 +336,45 @@ class Gemma3Backbone(Backbone):
        )
        sequence_output = self.layer_norm(x)

+        if is_embedding_model:
+            if embedding_dim is None or pooling_intermediate_dim is None:
+                raise ValueError(
+                    "Must specify embedding_dim and pooling_intermediate_dim."
+                )
+
+            # 1. Mask-aware Mean Pooling
+            pooled_output = Gemma3MeanPooling(dtype=dtype, name="mean_pooling")(
+                sequence_output, padding_mask=padding_mask_input
+            )
+
+            # 2. First Projection (Non-linear or Linear depending on preset)
+            pooled_output = layers.Dense(
+                pooling_intermediate_dim,
+                dtype=dtype,
+                name="pooling_dense_1",
+                use_bias=False,
+            )(pooled_output)
+
+            # 3. Final Projection
+            pooled_output = layers.Dense(
+                embedding_dim,
+                dtype=dtype,
+                name="embedding_projection",
+                use_bias=False,
+            )(pooled_output)
+
+            # 4. L2 Normalization (Crucial for Retrieval)
+            pooled_output = layers.UnitNormalization(
+                axis=-1, dtype=dtype, name="unit_normalization"
+            )(pooled_output)
+
+            outputs = {
+                "sequence_output": sequence_output,
+                "pooled_output": pooled_output,
+            }
+        else:
+            outputs = sequence_output
+
        inputs = {
            "token_ids": token_id_input,
            "padding_mask": padding_mask_input,
@@ -334,7 +390,7 @@ class Gemma3Backbone(Backbone):

        super().__init__(
            inputs=inputs,
-            outputs=sequence_output,
+            outputs=outputs,
            dtype=dtype,
            **kwargs,
        )
@@ -361,6 +417,9 @@ class Gemma3Backbone(Backbone):
        self.use_bidirectional_attention = use_bidirectional_attention
        self.layer_norm_epsilon = layer_norm_epsilon
        self.dropout = dropout
+        self.is_embedding_model = is_embedding_model
+        self.pooling_intermediate_dim = pooling_intermediate_dim
+        self.embedding_dim = embedding_dim

        # Keep `num_vision_tokens_per_image` as a backbone property for easy
        # access.
@@ -401,6 +460,9 @@ class Gemma3Backbone(Backbone):
                "use_bidirectional_attention": self.use_bidirectional_attention,
                "layer_norm_epsilon": self.layer_norm_epsilon,
                "dropout": self.dropout,
+                "is_embedding_model": self.is_embedding_model,
+                "pooling_intermediate_dim": self.pooling_intermediate_dim,
+                "embedding_dim": self.embedding_dim,
            }
        )
        return config
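Taken together, these hunks add an optional `EmbeddingGemma`-style head: mask-aware mean pooling, two bias-free projections, then unit normalization. `Gemma3MeanPooling`'s implementation is not shown in this diff, so the pooling below is an assumption; the projection and normalization steps follow the hunk (dimensions illustrative):

```python
import keras
from keras import layers, ops

hidden_dim, pooling_intermediate_dim, embedding_dim = 64, 128, 32
sequence_output = keras.random.normal((2, 10, hidden_dim))
padding_mask = ops.ones((2, 10), dtype="float32")

# Assumed mask-aware mean pooling (Gemma3MeanPooling is not shown above):
# sum valid positions, divide by the number of valid positions.
mask = ops.expand_dims(padding_mask, axis=-1)
pooled = ops.sum(sequence_output * mask, axis=1)
pooled = pooled / ops.maximum(ops.sum(mask, axis=1), 1.0)

# Two bias-free projections, then L2 normalization, as in the hunk.
pooled = layers.Dense(pooling_intermediate_dim, use_bias=False)(pooled)
pooled = layers.Dense(embedding_dim, use_bias=False)(pooled)
pooled = layers.UnitNormalization(axis=-1)(pooled)
print(ops.shape(pooled))  # (2, 32): unit-length retrieval embeddings
```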
keras_hub/src/models/gemma3/gemma3_causal_lm.py
@@ -249,7 +249,22 @@ class Gemma3CausalLM(CausalLM):
             inputs.get("vision_mask", None),
             inputs.get("vision_indices", None),
         )
-        if not self.backbone.text_only_model:
+
+        # Determine if we have actual images to process.
+        # After preprocessing, images shape is (batch, num_images, h, w, 3).
+        # For text-only input, num_images=0 (static shape).
+        # We use static shape check which returns a Python int, not a tensor.
+        num_images = 0
+        if (
+            images is not None
+            and hasattr(images, "shape")
+            and len(images.shape) > 1
+        ):
+            num_images = images.shape[
+                1
+            ]  # Static shape, returns Python int or None
+
+        if not self.backbone.text_only_model and num_images:
             # Handle an unbatched image. Unlike `token_ids` and
             # `padding_mask`, this will not automatically be upranked.
             if len(ops.shape(images)) == 4:
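The guard above reads `images.shape` as a static shape, so `num_images` comes back as a plain Python int (or `None` for an unknown dim) and text-only batches skip the vision branch without any tensor ops. A tiny illustration (array contents are placeholders):

```python
import numpy as np

# A text-only batch carries zero images: (batch, num_images=0, h, w, 3).
images = np.zeros((2, 0, 224, 224, 3), dtype="float32")

num_images = 0
if images is not None and hasattr(images, "shape") and len(images.shape) > 1:
    num_images = images.shape[1]  # static shape -> Python int, no graph ops

print(num_images)        # 0
print(bool(num_images))  # False -> the vision encoder is skipped
```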
keras_hub/src/models/gemma3/gemma3_decoder_block.py
@@ -8,7 +8,7 @@ from keras_hub.src.layers.modeling.transformer_layer_utils import (
     merge_padding_and_attention_mask,
 )
 from keras_hub.src.models.gemma3.gemma3_attention import CachedGemma3Attention
-from keras_hub.src.models.gemma3.rms_normalization import RMSNormalization
+from keras_hub.src.models.gemma3.gemma3_layers import RMSNormalization


 class Gemma3DecoderBlock(keras.layers.Layer):
@@ -251,6 +251,11 @@ class Gemma3DecoderBlock(keras.layers.Layer):
        cache_update_mask=None,
    ):
        # Note: `vision_mask` is used only for Gemma3.
+        # If float16, we clamp the input to avoid overflow.
+        is_float16 = keras.backend.standardize_dtype(x.dtype) == "float16"
+        if is_float16:
+            x = ops.clip(x, -65504, 65504)
+
        normalized_x = self.pre_attention_norm(x)
        attention_mask = self._compute_attention_mask(
            normalized_x, padding_mask, vision_mask, cache, cache_update_index
@@ -275,7 +280,15 @@ class Gemma3DecoderBlock(keras.layers.Layer):
        if self.dropout:
            attention = self.attention_dropout(attention)

-        attention_x = x + attention
+        if is_float16:
+            attention_x = ops.add(
+                ops.cast(x, "float32"), ops.cast(attention, "float32")
+            )
+            attention_x = ops.clip(attention_x, -65504, 65504)
+            attention_x = ops.cast(attention_x, "float16")
+        else:
+            attention_x = x + attention
+
        normalized_x = self.pre_ffw_norm(attention_x)

        x1 = self.gating_ffw(normalized_x)
@@ -286,7 +299,14 @@ class Gemma3DecoderBlock(keras.layers.Layer):
        if self.use_post_ffw_norm:
            x = self.post_ffw_norm(x)

-        x = x + attention_x
+        if is_float16:
+            x = ops.add(
+                ops.cast(x, "float32"), ops.cast(attention_x, "float32")
+            )
+            x = ops.clip(x, -65504, 65504)
+            x = ops.cast(x, "float16")
+        else:
+            x = x + attention_x

        if cache is not None:
            return x, new_cache
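Both residual connections in these hunks use the same pattern: accumulate in float32, clamp to float16's finite range (±65504), then cast back. A standalone sketch:

```python
from keras import ops

F16_MAX = 65504.0  # largest finite float16 value

def safe_residual_add(x, y):
    # Accumulate in float32, clamp, cast back, as in Gemma3DecoderBlock.
    out = ops.add(ops.cast(x, "float32"), ops.cast(y, "float32"))
    out = ops.clip(out, -F16_MAX, F16_MAX)
    return ops.cast(out, "float16")

x = ops.full((2, 4), 60000.0, dtype="float16")
y = ops.full((2, 4), 30000.0, dtype="float16")
print(safe_residual_add(x, y))  # clamped at 65504 instead of overflowing to inf
```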