keras_hub-0.20.0.dev1-py3-none-any.whl → keras_hub-0.21.0.dev1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. keras_hub/__init__.py +15 -33
  2. keras_hub/layers/__init__.py +134 -0
  3. keras_hub/metrics/__init__.py +11 -0
  4. keras_hub/models/__init__.py +642 -0
  5. keras_hub/samplers/__init__.py +18 -0
  6. keras_hub/src/layers/modeling/reversible_embedding.py +25 -35
  7. keras_hub/src/layers/preprocessing/image_converter.py +1 -0
  8. keras_hub/src/layers/preprocessing/random_deletion.py +1 -1
  9. keras_hub/src/layers/preprocessing/random_swap.py +1 -1
  10. keras_hub/src/models/audio_to_text.py +66 -0
  11. keras_hub/src/models/audio_to_text_preprocessor.py +80 -0
  12. keras_hub/src/models/backbone.py +5 -2
  13. keras_hub/src/models/cspnet/cspnet_backbone.py +51 -26
  14. keras_hub/src/models/cspnet/cspnet_presets.py +38 -3
  15. keras_hub/src/models/falcon/falcon_backbone.py +1 -1
  16. keras_hub/src/models/gemma/gemma_presets.py +10 -10
  17. keras_hub/src/models/gemma3/gemma3_causal_lm_preprocessor.py +3 -2
  18. keras_hub/src/models/gemma3/gemma3_presets.py +8 -8
  19. keras_hub/src/models/gemma3/gemma3_vision_encoder.py +1 -1
  20. keras_hub/src/models/llama/llama_attention.py +24 -6
  21. keras_hub/src/models/llama/llama_backbone.py +50 -16
  22. keras_hub/src/models/llama/llama_decoder.py +20 -3
  23. keras_hub/src/models/llama/llama_presets.py +3 -3
  24. keras_hub/src/models/llama/llama_rotary_embedding.py +180 -0
  25. keras_hub/src/models/llama3/llama3_backbone.py +10 -2
  26. keras_hub/src/models/llama3/llama3_presets.py +84 -2
  27. keras_hub/src/models/mistral/mistral_presets.py +3 -3
  28. keras_hub/src/models/mixtral/__init__.py +5 -0
  29. keras_hub/src/models/mixtral/mixtral_attention.py +252 -0
  30. keras_hub/src/models/mixtral/mixtral_backbone.py +207 -0
  31. keras_hub/src/models/mixtral/mixtral_causal_lm.py +281 -0
  32. keras_hub/src/models/mixtral/mixtral_causal_lm_preprocessor.py +76 -0
  33. keras_hub/src/models/mixtral/mixtral_decoder.py +494 -0
  34. keras_hub/src/models/mixtral/mixtral_layer_norm.py +34 -0
  35. keras_hub/src/models/mixtral/mixtral_presets.py +26 -0
  36. keras_hub/src/models/mixtral/mixtral_tokenizer.py +21 -0
  37. keras_hub/src/models/moonshine/__init__.py +5 -0
  38. keras_hub/src/models/moonshine/moonshine_audio_converter.py +301 -0
  39. keras_hub/src/models/moonshine/moonshine_audio_to_text.py +383 -0
  40. keras_hub/src/models/moonshine/moonshine_audio_to_text_preprocessor.py +272 -0
  41. keras_hub/src/models/moonshine/moonshine_backbone.py +478 -0
  42. keras_hub/src/models/moonshine/moonshine_decoder.py +313 -0
  43. keras_hub/src/models/moonshine/moonshine_encoder.py +212 -0
  44. keras_hub/src/models/moonshine/moonshine_layers.py +239 -0
  45. keras_hub/src/models/moonshine/moonshine_multi_head_attention.py +355 -0
  46. keras_hub/src/models/moonshine/moonshine_presets.py +25 -0
  47. keras_hub/src/models/moonshine/moonshine_tokenizer.py +62 -0
  48. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +11 -11
  49. keras_hub/src/models/pali_gemma/pali_gemma_vit.py +1 -1
  50. keras_hub/src/models/qwen/__init__.py +4 -0
  51. keras_hub/src/models/qwen/qwen_attention.py +3 -1
  52. keras_hub/src/models/qwen/qwen_backbone.py +8 -1
  53. keras_hub/src/models/qwen/qwen_causal_lm.py +7 -0
  54. keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py +7 -0
  55. keras_hub/src/models/qwen/qwen_presets.py +61 -0
  56. keras_hub/src/models/qwen/qwen_tokenizer.py +9 -0
  57. keras_hub/src/models/qwen_moe/__init__.py +5 -0
  58. keras_hub/src/models/qwen_moe/qwen_moe_attention.py +375 -0
  59. keras_hub/src/models/qwen_moe/qwen_moe_backbone.py +373 -0
  60. keras_hub/src/models/qwen_moe/qwen_moe_causal_lm.py +350 -0
  61. keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor.py +17 -0
  62. keras_hub/src/models/qwen_moe/qwen_moe_decoder.py +625 -0
  63. keras_hub/src/models/qwen_moe/qwen_moe_layernorm.py +32 -0
  64. keras_hub/src/models/qwen_moe/qwen_moe_presets.py +15 -0
  65. keras_hub/src/models/qwen_moe/qwen_moe_tokenizer.py +46 -0
  66. keras_hub/src/models/retinanet/retinanet_image_converter.py +0 -13
  67. keras_hub/src/models/retinanet/retinanet_presets.py +2 -2
  68. keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py +0 -18
  69. keras_hub/src/models/segformer/segformer_presets.py +12 -12
  70. keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +6 -0
  71. keras_hub/src/models/task.py +5 -2
  72. keras_hub/src/models/xception/__init__.py +5 -0
  73. keras_hub/src/models/xception/xception_backbone.py +188 -0
  74. keras_hub/src/models/xception/xception_image_classifier.py +12 -0
  75. keras_hub/src/models/xception/xception_image_classifier_preprocessor.py +14 -0
  76. keras_hub/src/models/xception/xception_image_converter.py +8 -0
  77. keras_hub/src/models/xception/xception_presets.py +14 -0
  78. keras_hub/src/tests/mocks/mock_gemma3_tokenizer.py +155 -0
  79. keras_hub/src/utils/coco/__init__.py +0 -0
  80. keras_hub/src/utils/coco/coco_utils.py +133 -0
  81. keras_hub/src/utils/imagenet/imagenet_utils.py +36 -0
  82. keras_hub/src/utils/keras_utils.py +11 -0
  83. keras_hub/src/utils/preset_utils.py +70 -10
  84. keras_hub/src/utils/tensor_utils.py +27 -1
  85. keras_hub/src/utils/timm/convert_cspnet.py +94 -23
  86. keras_hub/src/utils/timm/preset_loader.py +6 -6
  87. keras_hub/src/utils/transformers/convert_llama3.py +21 -1
  88. keras_hub/src/utils/transformers/convert_mixtral.py +139 -0
  89. keras_hub/src/utils/transformers/convert_qwen.py +1 -0
  90. keras_hub/src/utils/transformers/convert_qwen_moe.py +253 -0
  91. keras_hub/src/utils/transformers/preset_loader.py +6 -0
  92. keras_hub/src/{version_utils.py → version.py} +1 -1
  93. keras_hub/tokenizers/__init__.py +117 -0
  94. keras_hub/utils/__init__.py +21 -0
  95. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/METADATA +6 -20
  96. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/RECORD +98 -55
  97. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/WHEEL +1 -1
  98. keras_hub/api/__init__.py +0 -15
  99. keras_hub/api/layers/__init__.py +0 -86
  100. keras_hub/api/metrics/__init__.py +0 -11
  101. keras_hub/api/models/__init__.py +0 -416
  102. keras_hub/api/samplers/__init__.py +0 -16
  103. keras_hub/api/tokenizers/__init__.py +0 -58
  104. keras_hub/api/utils/__init__.py +0 -9
  105. {keras_hub-0.20.0.dev1.dist-info → keras_hub-0.21.0.dev1.dist-info}/top_level.txt +0 -0

keras_hub/src/models/pali_gemma/pali_gemma_presets.py
@@ -81,7 +81,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_ft_docci_10b_448/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_ft_docci_10b_448/3",
     },
     "pali_gemma2_mix_3b_224": {
         "metadata": {
@@ -126,7 +126,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_10b_224/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_10b_224/3",
     },
     "pali_gemma2_mix_10b_448": {
         "metadata": {
@@ -141,7 +141,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_10b_448/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_10b_448/3",
     },
     "pali_gemma2_mix_28b_224": {
         "metadata": {
@@ -156,7 +156,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_28b_mix_224/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_28b_mix_224/3",
     },
     "pali_gemma2_mix_28b_448": {
         "metadata": {
@@ -171,7 +171,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_28b_mix_448/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_28b_mix_448/3",
     },
     "pali_gemma2_pt_3b_224": {
         "metadata": {
@@ -231,7 +231,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_10b_224/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_10b_224/3",
     },
     "pali_gemma2_pt_10b_448": {
         "metadata": {
@@ -246,7 +246,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_10b_448/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_10b_448/3",
     },
     "pali_gemma2_pt_10b_896": {
         "metadata": {
@@ -261,7 +261,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_10b_896/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_10b_896/3",
     },
     "pali_gemma2_pt_28b_224": {
         "metadata": {
@@ -276,7 +276,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_28b_224/3",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_28b_224/4",
     },
     "pali_gemma2_pt_28b_448": {
         "metadata": {
@@ -291,7 +291,7 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_28b_448/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_28b_448/3",
     },
     "pali_gemma2_pt_28b_896": {
         "metadata": {
@@ -306,6 +306,6 @@ backbone_presets = {
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",
         },
-        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_28b_896/2",
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_pt_28b_896/3",
     },
 }

keras_hub/src/models/pali_gemma/pali_gemma_vit.py
@@ -329,7 +329,7 @@ class PaliGemmaVitEncoder(keras.layers.Layer):
             # Fix the compatibility issue with Keras 3.1 where
             # `compute_output_spec` fails to propagate `inputs_shape`
             # correctly, causing it to be `None`.
-            inputs_shape = [None, None, None]
+            return [None, None, self.hidden_dim]
         return [
             inputs_shape[0],
             (inputs_shape[1] // self.patch_size) ** 2,

keras_hub/src/models/qwen/__init__.py
@@ -1 +1,5 @@
 from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone
+from keras_hub.src.models.qwen.qwen_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, QwenBackbone)
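
For context, `register_presets` wires the preset names defined in `qwen_presets.py` (added later in this diff) to `QwenBackbone`, so they become resolvable through the standard `from_preset` entry point. A minimal usage sketch, assuming a keras-hub 0.21.0.dev1 install with access to the Kaggle handles the presets point at:

    import keras_hub

    # Resolves the registered name to its Kaggle handle and downloads the
    # weights on first use; "qwen2.5_0.5b_en" comes from qwen_presets.py.
    backbone = keras_hub.models.QwenBackbone.from_preset("qwen2.5_0.5b_en")
    print(backbone.count_params())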

keras_hub/src/models/qwen/qwen_attention.py
@@ -287,7 +287,9 @@ class QwenAttention(keras.layers.Layer):
         if self.use_sliding_window_attention:
             attention_mask = self._mask_sliding_window(
                 attention_mask,
-                cache_update_index=cache_update_index,
+                cache_update_index=cache_update_index
+                if cache_update_index
+                else 0,
             )
         attention_scores = self._masked_softmax(
             attention_scores, attention_mask

keras_hub/src/models/qwen/qwen_backbone.py
@@ -1,6 +1,7 @@
 import keras
 from keras import ops

+from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.modeling.reversible_embedding import (
     ReversibleEmbedding,
 )
@@ -13,6 +14,12 @@ def _qwen_kernel_initializer(stddev=0.02):
     return keras.initializers.RandomNormal(stddev=stddev)


+@keras_hub_export(
+    [
+        "keras_hub.models.QwenBackbone",
+        "keras_hub.models.Qwen2Backbone",
+    ]
+)
 class QwenBackbone(Backbone):
     """
     The Qwen Transformer core architecture with hyperparameters.
@@ -168,7 +175,7 @@ class QwenBackbone(Backbone):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.dropout = dropout
         self.tie_word_embeddings = tie_word_embeddings
-        self.use_sliding_window_attention = (use_sliding_window_attention,)
+        self.use_sliding_window_attention = use_sliding_window_attention
         self.sliding_window_size = sliding_window_size

     def get_config(self):
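
For context on the one-line fix above: the old assignment's trailing comma turned the flag into a single-element tuple, which is always truthy, so truthiness checks on the attribute and the value serialized by `get_config()` no longer matched the boolean that was passed in. A plain-Python illustration of the pitfall (not keras-hub code):

    use_sliding_window_attention = False

    # Old form: the trailing comma builds a 1-tuple, and any tuple is truthy.
    wrapped = (use_sliding_window_attention,)
    assert wrapped == (False,)
    assert bool(wrapped) is True  # an `if` check on it would still pass

    # Fixed form: the boolean is stored as-is.
    plain = use_sliding_window_attention
    assert bool(plain) is False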

keras_hub/src/models/qwen/qwen_causal_lm.py
@@ -1,6 +1,7 @@
 import keras
 from keras import ops

+from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.causal_lm import CausalLM
 from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone
 from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import (
@@ -9,6 +10,12 @@ from keras_hub.src.models.qwen.qwen_causal_lm_preprocessor import (
 from keras_hub.src.utils.tensor_utils import any_equal


+@keras_hub_export(
+    [
+        "keras_hub.models.QwenCausalLM",
+        "keras_hub.models.Qwen2CausalLM",
+    ]
+)
 class QwenCausalLM(CausalLM):
     backbone_cls = QwenBackbone
     preprocessor_cls = QwenCausalLMPreprocessor
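
A note on the `keras_hub_export` calls added throughout these Qwen files: passing a list of paths registers the same class under every name, which is how the `Qwen2*` aliases are introduced next to the existing `Qwen*` symbols. A small sketch of what this implies for user code, assuming the regenerated 0.21.0.dev1 API surface:

    import keras_hub

    # Both public names are expected to resolve to the same class object;
    # "Qwen2CausalLM" is an alias added by the multi-path export above.
    assert keras_hub.models.QwenCausalLM is keras_hub.models.Qwen2CausalLM
    assert keras_hub.models.QwenBackbone is keras_hub.models.Qwen2Backbone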

keras_hub/src/models/qwen/qwen_causal_lm_preprocessor.py
@@ -1,8 +1,15 @@
+from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
 from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone
 from keras_hub.src.models.qwen.qwen_tokenizer import QwenTokenizer


+@keras_hub_export(
+    [
+        "keras_hub.models.QwenCausalLMPreprocessor",
+        "keras_hub.models.Qwen2CausalLMPreprocessor",
+    ]
+)
 class QwenCausalLMPreprocessor(CausalLMPreprocessor):
     backbone_cls = QwenBackbone
     tokenizer_cls = QwenTokenizer

keras_hub/src/models/qwen/qwen_presets.py
@@ -0,0 +1,61 @@
+"""Qwen preset configurations."""
+
+backbone_presets = {
+    "qwen2.5_0.5b_en": {
+        "metadata": {
+            "description": ("24-layer Qwen model with 0.5 billion parameters."),
+            "params": 494032768,
+            "path": "qwen",
+        },
+        "kaggle_handle": "kaggle://keras/qwen/keras/qwen2.5_0.5b_en/1",
+    },
+    "qwen2.5_3b_en": {
+        "metadata": {
+            "description": ("36-layer Qwen model with 3.1 billion parameters."),
+            "params": 3085938688,
+            "path": "qwen",
+        },
+        "kaggle_handle": "kaggle://keras/qwen/keras/qwen2.5_3b_en/1",
+    },
+    "qwen2.5_7b_en": {
+        "metadata": {
+            "description": ("48-layer Qwen model with 7 billion parameters."),
+            "params": 6993420288,
+            "path": "qwen",
+        },
+        "kaggle_handle": "kaggle://keras/qwen/keras/qwen2.5_7b_en/3",
+    },
+    "qwen2.5_instruct_0.5b_en": {
+        "metadata": {
+            "description": (
+                "Instruction fine-tuned 24-layer Qwen model with 0.5 ",
+                "billion parameters.",
+            ),
+            "params": 494032768,
+            "path": "qwen",
+        },
+        "kaggle_handle": "kaggle://keras/qwen/keras/qwen2.5_instruct_0.5b_en/1",
+    },
+    "qwen2.5_instruct_32b_en": {
+        "metadata": {
+            "description": (
+                "Instruction fine-tuned 64-layer Qwen model with 32 ",
+                "billion parameters.",
+            ),
+            "params": 32763876352,
+            "path": "qwen",
+        },
+        "kaggle_handle": "kaggle://keras/qwen/keras/qwen2.5_instruct_32b_en/2",
+    },
+    "qwen2.5_instruct_72b_en": {
+        "metadata": {
+            "description": (
+                "Instruction fine-tuned 80-layer Qwen model with 72 ",
+                "billion parameters.",
+            ),
+            "params": 72706203648,
+            "path": "qwen",
+        },
+        "kaggle_handle": "kaggle://keras/qwen/keras/qwen2.5_instruct_72b_en/2",
+    },
+}

keras_hub/src/models/qwen/qwen_tokenizer.py
@@ -1,7 +1,16 @@
+from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.qwen.qwen_backbone import QwenBackbone
 from keras_hub.src.tokenizers.byte_pair_tokenizer import BytePairTokenizer


+@keras_hub_export(
+    [
+        "keras_hub.tokenizers.QwenTokenizer",
+        "keras_hub.tokenizers.Qwen2Tokenizer",
+        "keras_hub.models.QwenTokenizer",
+        "keras_hub.models.Qwen2Tokenizer",
+    ]
+)
 class QwenTokenizer(BytePairTokenizer):
     """Tokenizer for Qwen models.


keras_hub/src/models/qwen_moe/__init__.py
@@ -0,0 +1,5 @@
+from keras_hub.src.models.qwen_moe.qwen_moe_backbone import QwenMoeBackbone
+from keras_hub.src.models.qwen_moe.qwen_moe_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, QwenMoeBackbone)

keras_hub/src/models/qwen_moe/qwen_moe_attention.py
@@ -0,0 +1,375 @@
+import inspect
+import math
+
+import keras
+from keras import ops
+
+from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
+from keras_hub.src.utils.keras_utils import clone_initializer
+from keras_hub.src.utils.keras_utils import fused_attention_op_available
+from keras_hub.src.utils.keras_utils import gpu_supports_fused_attention_op
+from keras_hub.src.utils.keras_utils import running_on_gpu
+from keras_hub.src.utils.keras_utils import running_on_tpu
+
+
+class QwenMoeAttention(keras.layers.Layer):
+    """A multi-head attention layer for Qwen-Moe model
+
+    This attention implementation supports grouped-query attention (GQA) where
+    the number of key-value heads can be less than the number of query heads.
+
+    Args:
+        num_query_heads: Number of query heads.
+        num_key_value_heads: Number of key/value heads (for GQA).
+        rope_max_wavelength: Maximum wavelength for RoPE (Rotary Position
+            Embedding).
+        rope_scaling_factor: Scaling factor for RoPE, used for extending
+            context length.
+        kernel_initializer: Initializer for the kernel weights.
+        bias_initializer: Initializer for the bias weights.
+        dropout: Dropout rate for attention weights.
+        use_sliding_window_attention: Whether to use sliding window
+            attention.
+        sliding_window_size: Size of the sliding window for attention.
+        **kwargs: Additional keyword arguments to pass to the Layer.
+    """
+
+    def __init__(
+        self,
+        num_query_heads,
+        num_key_value_heads,
+        rope_max_wavelength=10000,
+        rope_scaling_factor=1,
+        kernel_initializer="glorot_uniform",
+        bias_initializer="zeros",
+        dropout=0,
+        use_sliding_window_attention=False,
+        sliding_window_size=4096,
+        **kwargs,
+    ):
+        super().__init__(
+            **kwargs,
+        )
+        self.num_query_heads = num_query_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.dropout = dropout
+
+        self.num_key_value_groups = num_query_heads // num_key_value_heads
+        self.rope_max_wavelength = rope_max_wavelength
+
+        self.kernel_initializer = keras.initializers.get(
+            clone_initializer(kernel_initializer)
+        )
+        self.bias_initializer = keras.initializers.get(
+            clone_initializer(bias_initializer)
+        )
+
+        self.rope_scaling_factor = rope_scaling_factor
+        self.use_sliding_window_attention = use_sliding_window_attention
+        self.sliding_window_size = sliding_window_size
+
+    def build(self, inputs_shape):
+        # Einsum variables:
+        # b = batch size
+        # q = query length
+        # k = key/value length
+        # m = model dim
+        # u = num query heads
+        # v = num key/value heads
+        # h = head dim
+        hidden_dim = inputs_shape[-1]
+        head_dim = hidden_dim // self.num_query_heads
+        self._inv_norm_factor = 1.0 / math.sqrt(head_dim)
+        self.query_dense = keras.layers.EinsumDense(
+            equation="bqm,muh->bquh",
+            output_shape=(None, self.num_query_heads, head_dim),
+            kernel_initializer=self.kernel_initializer,
+            bias_initializer=self.bias_initializer,
+            bias_axes="uh",
+            dtype=self.dtype_policy,
+            name="query",
+        )
+        self.query_dense.build(inputs_shape)
+
+        self.key_dense = keras.layers.EinsumDense(
+            equation="bkm,mvh->bkvh",
+            output_shape=(
+                None,
+                self.num_key_value_heads,
+                head_dim,
+            ),
+            kernel_initializer=self.kernel_initializer,
+            bias_initializer=self.bias_initializer,
+            bias_axes="vh",
+            dtype=self.dtype_policy,
+            name="key",
+        )
+        self.key_dense.build(inputs_shape)
+
+        self.value_dense = keras.layers.EinsumDense(
+            equation="bkm,mvh->bkvh",
+            output_shape=(
+                None,
+                self.num_key_value_heads,
+                head_dim,
+            ),
+            kernel_initializer=self.kernel_initializer,
+            bias_initializer=self.bias_initializer,
+            bias_axes="vh",
+            dtype=self.dtype_policy,
+            name="value",
+        )
+        self.value_dense.build(inputs_shape)
+
+        self._softmax = keras.layers.Softmax(
+            axis=-1,
+            dtype="float32",
+            name="attention_softmax",
+        )
+
+        self._dropout_layer = keras.layers.Dropout(
+            rate=self.dropout,
+            dtype=self.dtype_policy,
+        )
+
+        self._output_dense = keras.layers.EinsumDense(
+            equation="bquh,uhm->bqm",
+            output_shape=(None, hidden_dim),
+            kernel_initializer=self.kernel_initializer,
+            dtype=self.dtype_policy,
+            name="attention_output",
+        )
+        self._output_dense.build((None, None, self.num_query_heads, head_dim))
+
+        self.rotary_embedding_layer = RotaryEmbedding(
+            max_wavelength=self.rope_max_wavelength,
+            scaling_factor=self.rope_scaling_factor,
+            dtype=self.dtype_policy,
+        )
+
+        self._dot_product_equation = "bquh,bkuh->buqk"
+        self._combine_equation = "buqk,bkuh->bquh"
+
+        self.built = True
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        cache=None,
+        cache_update_index=None,
+        training=None,
+    ):
+        """Applies attention mechanism to the input hidden states.
+
+        Args:
+            hidden_states: Input tensor of shape [batch_size, seq_length,
+                hidden_size].
+            attention_mask: Mask tensor of shape [batch_size, seq_length,
+                seq_length].
+            cache: Optional cached key and value tensors.
+            cache_update_index: Index at which to update the cache.
+            training: Boolean indicating whether in training mode.
+
+        Returns:
+            attention_output: Output tensor after applying attention.
+            cache: Updated cache tensors (if cache is provided).
+        """
+        start_index = (
+            cache_update_index if cache_update_index is not None else 0
+        )
+
+        query = self.query_dense(hidden_states)
+
+        # Compute RoPE for queries
+        query = self.rotary_embedding_layer(query, start_index=start_index)
+
+        def _compute_key_value(x):
+            key, value = self.key_dense(x), self.value_dense(x)
+            # Compute RoPE for keys
+            key = self.rotary_embedding_layer(key, start_index=start_index)
+            return key, value
+
+        if cache is not None:
+            key_cache = cache[:, 0, ...]
+            value_cache = cache[:, 1, ...]
+            if cache_update_index is None:
+                key = key_cache
+                value = value_cache
+            else:
+                key_update, value_update = _compute_key_value(hidden_states)
+                start = [0, cache_update_index, 0, 0]
+                key = ops.slice_update(key_cache, start, key_update)
+                value = ops.slice_update(value_cache, start, value_update)
+                cache = ops.stack((key, value), axis=1)
+        else:
+            if cache_update_index is not None:
+                raise ValueError(
+                    "`cache_update_index` should not be set if `cache` is "
+                    f"`None`. Received: cache={cache}, "
+                    f"cache_update_index={cache_update_index}"
+                )
+            key, value = _compute_key_value(hidden_states)
+
+        # [batch_shape, seq_len, num_key_value_heads, head_dim]
+        # -> [batch_shape, seq_len, num_heads, head_dim]
+        key = ops.repeat(key, repeats=self.num_key_value_groups, axis=2)
+        value = ops.repeat(value, repeats=self.num_key_value_groups, axis=2)
+
+        attention_output = self._compute_attention(
+            query,
+            key,
+            value,
+            attention_mask,
+            cache_update_index=cache_update_index,
+        )
+
+        attention_output = self._dropout_layer(
+            attention_output, training=training
+        )
+
+        attention_output = self._output_dense(attention_output)
+
+        if cache is not None:
+            return attention_output, cache
+        return attention_output
+
+    def _masked_softmax(self, attention_scores, attention_mask=None):
+        """Applies softmax with optional masking.
+
+        Args:
+            attention_scores: Attention score tensor.
+            attention_mask: Optional mask tensor.
+
+        Returns:
+            Masked softmax attention weights.
+        """
+        if attention_mask is not None:
+            return self._softmax(
+                attention_scores, attention_mask[:, None, :, :]
+            )
+        return self._softmax(attention_scores)
+
+    def _use_fused_attention_op(self):
+        if not fused_attention_op_available():
+            return False
+        if self.dropout > 0.0:
+            return False
+        if running_on_gpu():
+            return gpu_supports_fused_attention_op()
+        elif running_on_tpu():
+            # TPU supports softcap with on keras >= 3.10.
+            sig = inspect.signature(ops.dot_product_attention)
+            return "attn_logits_soft_cap" in sig.parameters
+        else:
+            return False
+
+    def _compute_attention(
+        self,
+        query,
+        key,
+        value,
+        attention_mask=None,
+        cache_update_index=None,
+        **kwargs,
+    ):
+        """Computes attention using query, key, and value tensors.
+
+        Uses Flash Attention when available for better performance.
+
+        Args:
+            query: Query tensor.
+            key: Key tensor.
+            value: Value tensor.
+            attention_mask: Optional mask tensor.
+            cache_update_index: Index for sliding window computation.
+
+        Returns:
+            attention_output: Output tensor after applying attention.
+        """
+        if self._use_fused_attention_op():
+            if attention_mask is not None:
+                attention_mask = ops.expand_dims(attention_mask, axis=1)
+                attention_mask = ops.cast(attention_mask, dtype="bool")
+
+            attention_output = ops.dot_product_attention(
+                query,
+                key,
+                value,
+                mask=attention_mask,
+                scale=self._inv_norm_factor,
+                **kwargs,
+            )
+            return attention_output
+
+        attention_scores = ops.einsum(self._dot_product_equation, query, key)
+
+        attention_scores = ops.multiply(
+            attention_scores,
+            ops.cast(self._inv_norm_factor, self.compute_dtype),
+        )
+        if self.use_sliding_window_attention:
+            attention_mask = self._mask_sliding_window(
+                attention_mask,
+                cache_update_index=cache_update_index
+                if cache_update_index
+                else 0,
+            )
+        attention_scores = self._masked_softmax(
+            attention_scores, attention_mask
+        )
+        attention_scores = ops.cast(attention_scores, self.compute_dtype)
+        attention_output = ops.einsum(
+            self._combine_equation, attention_scores, value
+        )
+
+        return attention_output
+
+    def _mask_sliding_window(
+        self,
+        attention_mask,
+        cache_update_index=0,
+    ):
+        """Creates and combines a sliding window mask with the attention mask.
+
+        Args:
+            attention_mask: Original attention mask.
+            cache_update_index: Starting index for the sliding window.
+
+        Returns:
+            Combined attention mask with sliding window constraints.
+        """
+        _, query_len, key_len = ops.shape(attention_mask)
+        # Compute the sliding window for square attention.
+        all_ones = ops.ones((key_len, key_len), "bool")
+        sliding_mask = ops.triu(
+            all_ones, -1 * self.sliding_window_size + 1
+        ) * ops.tril(all_ones, self.sliding_window_size - 1)
+        # Slice the window for short queries during generation.
+        start = (cache_update_index, 0)
+        sliding_mask = ops.slice(sliding_mask, start, (query_len, key_len))
+        sliding_mask = ops.expand_dims(sliding_mask, 0)
+        return ops.logical_and(attention_mask, ops.cast(sliding_mask, "bool"))
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "num_query_heads": self.num_query_heads,
+                "num_key_value_heads": self.num_key_value_heads,
+                "rope_max_wavelength": self.rope_max_wavelength,
+                "rope_scaling_factor": self.rope_scaling_factor,
+                "kernel_initializer": keras.initializers.serialize(
+                    self.kernel_initializer
+                ),
+                "bias_initializer": keras.initializers.serialize(
+                    self.bias_initializer
+                ),
+                "dropout": self.dropout,
+                "use_sliding_window_attention": (
+                    self.use_sliding_window_attention
+                ),
+                "sliding_window_size": self.sliding_window_size,
+            }
+        )
+        return config
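
As a standalone illustration of the banded mask built by `_mask_sliding_window` above, the sketch below reproduces the same `keras.ops` calls outside of any layer. The shapes, window size, and causal starting mask are made-up example values, not keras-hub code:

    import numpy as np
    from keras import ops

    query_len, key_len = 6, 6
    sliding_window_size = 3
    cache_update_index = 0

    # Start from a causal mask shaped [batch, query_len, key_len].
    causal = np.tril(np.ones((query_len, key_len), dtype=bool))[None, ...]
    attention_mask = ops.convert_to_tensor(causal)

    # Band of width 2 * window - 1 around the diagonal: a key position is
    # kept only if it lies within sliding_window_size - 1 of the query.
    all_ones = ops.ones((key_len, key_len), "bool")
    sliding_mask = ops.triu(all_ones, -1 * sliding_window_size + 1) * ops.tril(
        all_ones, sliding_window_size - 1
    )

    # During cached generation the query block starts at cache_update_index,
    # so slice out the matching rows before combining with the causal mask.
    start = (cache_update_index, 0)
    sliding_mask = ops.slice(sliding_mask, start, (query_len, key_len))
    sliding_mask = ops.expand_dims(sliding_mask, 0)
    combined = ops.logical_and(attention_mask, ops.cast(sliding_mask, "bool"))
    print(ops.convert_to_numpy(combined)[0].astype(int))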