keras-hub-nightly 0.16.1.dev202410200345__py3-none-any.whl → 0.19.0.dev202412070351__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/api/layers/__init__.py +12 -0
- keras_hub/api/models/__init__.py +32 -0
- keras_hub/src/bounding_box/__init__.py +2 -0
- keras_hub/src/bounding_box/converters.py +102 -12
- keras_hub/src/layers/modeling/rms_normalization.py +34 -0
- keras_hub/src/layers/modeling/transformer_encoder.py +27 -7
- keras_hub/src/layers/preprocessing/image_converter.py +5 -0
- keras_hub/src/models/albert/albert_presets.py +0 -8
- keras_hub/src/models/bart/bart_presets.py +0 -6
- keras_hub/src/models/bert/bert_presets.py +0 -20
- keras_hub/src/models/bloom/bloom_presets.py +0 -16
- keras_hub/src/models/clip/__init__.py +5 -0
- keras_hub/src/models/clip/clip_backbone.py +286 -0
- keras_hub/src/models/clip/clip_encoder_block.py +19 -4
- keras_hub/src/models/clip/clip_image_converter.py +8 -0
- keras_hub/src/models/clip/clip_presets.py +93 -0
- keras_hub/src/models/clip/clip_text_encoder.py +4 -1
- keras_hub/src/models/clip/clip_tokenizer.py +18 -3
- keras_hub/src/models/clip/clip_vision_embedding.py +101 -0
- keras_hub/src/models/clip/clip_vision_encoder.py +159 -0
- keras_hub/src/models/deberta_v3/deberta_v3_presets.py +0 -10
- keras_hub/src/models/deeplab_v3/deeplab_v3_presets.py +0 -2
- keras_hub/src/models/deeplab_v3/deeplab_v3_segmenter.py +5 -3
- keras_hub/src/models/densenet/densenet_backbone.py +1 -1
- keras_hub/src/models/densenet/densenet_presets.py +0 -6
- keras_hub/src/models/distil_bert/distil_bert_presets.py +0 -6
- keras_hub/src/models/efficientnet/__init__.py +9 -0
- keras_hub/src/models/efficientnet/cba.py +141 -0
- keras_hub/src/models/efficientnet/efficientnet_backbone.py +139 -56
- keras_hub/src/models/efficientnet/efficientnet_image_classifier.py +14 -0
- keras_hub/src/models/efficientnet/efficientnet_image_classifier_preprocessor.py +16 -0
- keras_hub/src/models/efficientnet/efficientnet_image_converter.py +10 -0
- keras_hub/src/models/efficientnet/efficientnet_presets.py +192 -0
- keras_hub/src/models/efficientnet/fusedmbconv.py +81 -36
- keras_hub/src/models/efficientnet/mbconv.py +52 -21
- keras_hub/src/models/electra/electra_presets.py +0 -12
- keras_hub/src/models/f_net/f_net_presets.py +0 -4
- keras_hub/src/models/falcon/falcon_presets.py +0 -2
- keras_hub/src/models/flux/__init__.py +5 -0
- keras_hub/src/models/flux/flux_layers.py +494 -0
- keras_hub/src/models/flux/flux_maths.py +218 -0
- keras_hub/src/models/flux/flux_model.py +231 -0
- keras_hub/src/models/flux/flux_presets.py +14 -0
- keras_hub/src/models/flux/flux_text_to_image.py +142 -0
- keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +73 -0
- keras_hub/src/models/gemma/gemma_presets.py +0 -40
- keras_hub/src/models/gpt2/gpt2_presets.py +0 -9
- keras_hub/src/models/image_object_detector.py +87 -0
- keras_hub/src/models/image_object_detector_preprocessor.py +57 -0
- keras_hub/src/models/image_to_image.py +16 -10
- keras_hub/src/models/inpaint.py +20 -13
- keras_hub/src/models/llama/llama_backbone.py +1 -1
- keras_hub/src/models/llama/llama_presets.py +5 -15
- keras_hub/src/models/llama3/llama3_presets.py +0 -8
- keras_hub/src/models/mistral/mistral_presets.py +0 -6
- keras_hub/src/models/mit/mit_backbone.py +41 -27
- keras_hub/src/models/mit/mit_layers.py +9 -7
- keras_hub/src/models/mit/mit_presets.py +12 -24
- keras_hub/src/models/opt/opt_presets.py +0 -8
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +61 -11
- keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +21 -23
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +166 -10
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +12 -11
- keras_hub/src/models/phi3/phi3_presets.py +0 -4
- keras_hub/src/models/resnet/resnet_presets.py +10 -42
- keras_hub/src/models/retinanet/__init__.py +5 -0
- keras_hub/src/models/retinanet/anchor_generator.py +52 -53
- keras_hub/src/models/retinanet/feature_pyramid.py +99 -36
- keras_hub/src/models/retinanet/non_max_supression.py +1 -0
- keras_hub/src/models/retinanet/prediction_head.py +192 -0
- keras_hub/src/models/retinanet/retinanet_backbone.py +146 -0
- keras_hub/src/models/retinanet/retinanet_image_converter.py +53 -0
- keras_hub/src/models/retinanet/retinanet_label_encoder.py +49 -51
- keras_hub/src/models/retinanet/retinanet_object_detector.py +382 -0
- keras_hub/src/models/retinanet/retinanet_object_detector_preprocessor.py +14 -0
- keras_hub/src/models/retinanet/retinanet_presets.py +15 -0
- keras_hub/src/models/roberta/roberta_presets.py +0 -4
- keras_hub/src/models/sam/sam_backbone.py +0 -1
- keras_hub/src/models/sam/sam_image_segmenter.py +9 -10
- keras_hub/src/models/sam/sam_presets.py +0 -6
- keras_hub/src/models/segformer/__init__.py +8 -0
- keras_hub/src/models/segformer/segformer_backbone.py +163 -0
- keras_hub/src/models/segformer/segformer_image_converter.py +8 -0
- keras_hub/src/models/segformer/segformer_image_segmenter.py +171 -0
- keras_hub/src/models/segformer/segformer_image_segmenter_preprocessor.py +31 -0
- keras_hub/src/models/segformer/segformer_presets.py +124 -0
- keras_hub/src/models/stable_diffusion_3/mmdit.py +41 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +38 -21
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +3 -3
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +3 -3
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py +28 -4
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +1 -1
- keras_hub/src/models/t5/t5_backbone.py +5 -4
- keras_hub/src/models/t5/t5_presets.py +41 -13
- keras_hub/src/models/text_to_image.py +13 -5
- keras_hub/src/models/vgg/vgg_backbone.py +1 -1
- keras_hub/src/models/vgg/vgg_presets.py +0 -8
- keras_hub/src/models/whisper/whisper_audio_converter.py +1 -1
- keras_hub/src/models/whisper/whisper_presets.py +0 -20
- keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +0 -4
- keras_hub/src/tests/test_case.py +25 -0
- keras_hub/src/utils/preset_utils.py +17 -4
- keras_hub/src/utils/timm/convert_efficientnet.py +449 -0
- keras_hub/src/utils/timm/preset_loader.py +3 -0
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.16.1.dev202410200345.dist-info → keras_hub_nightly-0.19.0.dev202412070351.dist-info}/METADATA +15 -26
- {keras_hub_nightly-0.16.1.dev202410200345.dist-info → keras_hub_nightly-0.19.0.dev202412070351.dist-info}/RECORD +109 -76
- {keras_hub_nightly-0.16.1.dev202410200345.dist-info → keras_hub_nightly-0.19.0.dev202412070351.dist-info}/WHEEL +1 -1
- {keras_hub_nightly-0.16.1.dev202410200345.dist-info → keras_hub_nightly-0.19.0.dev202412070351.dist-info}/top_level.txt +0 -0
keras_hub/src/models/mistral/mistral_presets.py

@@ -6,9 +6,7 @@ backbone_presets = {
         "metadata": {
             "description": "Mistral 7B base model",
             "params": 7241732096,
-            "official_name": "Mistral",
             "path": "mistral",
-            "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md",
         },
         "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/6",
     },
@@ -16,9 +14,7 @@ backbone_presets = {
         "metadata": {
             "description": "Mistral 7B instruct model",
             "params": 7241732096,
-            "official_name": "Mistral",
             "path": "mistral",
-            "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md",
         },
         "kaggle_handle": "kaggle://keras/mistral/keras/mistral_instruct_7b_en/6",
     },
@@ -26,9 +22,7 @@ backbone_presets = {
         "metadata": {
             "description": "Mistral 7B instruct Version 0.2 model",
             "params": 7241732096,
-            "official_name": "Mistral",
             "path": "mistral",
-            "model_card": "https://github.com/mistralai/mistral-src/blob/main/README.md",
         },
         "kaggle_handle": "kaggle://keras/mistral/keras/mistral_0.2_instruct_7b_en/1",
     },
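Net effect of these preset hunks: the `official_name` and `model_card` metadata keys are removed, leaving only `description`, `params`, and `path`. A minimal sketch of the trimmed entry shape, with values copied from the hunk above:

```python
# Shape of a preset entry after this change; "official_name" and
# "model_card" are no longer present in the metadata dict.
backbone_presets = {
    "mistral_7b_en": {
        "metadata": {
            "description": "Mistral 7B base model",
            "params": 7241732096,
            "path": "mistral",
        },
        "kaggle_handle": "kaggle://keras/mistral/keras/mistral_7b_en/6",
    },
}
```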
keras_hub/src/models/mit/mit_backbone.py

@@ -1,3 +1,14 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import keras
 import numpy as np
 from keras import ops
@@ -12,13 +23,13 @@ from keras_hub.src.models.mit.mit_layers import OverlappingPatchingAndEmbedding
 class MiTBackbone(FeaturePyramidBackbone):
     def __init__(
         self,
-
+        layerwise_depths,
         num_layers,
-
-
+        layerwise_num_heads,
+        layerwise_sr_ratios,
         max_drop_path_rate,
-
-
+        layerwise_patch_sizes,
+        layerwise_strides,
         image_shape=(None, None, 3),
         hidden_dims=None,
         **kwargs,
@@ -32,12 +43,12 @@ class MiTBackbone(FeaturePyramidBackbone):
         https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/classification/mix_transformer)

         Args:
-
+            layerwise_depths: The number of transformer encoders to be used per layer in the
                 network.
             num_layers: int. The number of Transformer layers.
-
+            layerwise_num_heads: list of integers, the number of heads to use
                 in the attention computation for each layer.
-
+            layerwise_sr_ratios: list of integers, the sequence reduction
                 ratio to perform for each layer on the sequence before key and
                 value projections. If set to > 1, a `Conv2D` layer is used to
                 reduce the length of the sequence.
@@ -71,7 +82,10 @@ class MiTBackbone(FeaturePyramidBackbone):
        model.fit(images, labels, epochs=3)
        ```
        """
-        dpr = [
+        dpr = [
+            x
+            for x in np.linspace(0.0, max_drop_path_rate, sum(layerwise_depths))
+        ]

         # === Layers ===
         cur = 0
@@ -82,8 +96,8 @@ class MiTBackbone(FeaturePyramidBackbone):
         for i in range(num_layers):
             patch_embed_layer = OverlappingPatchingAndEmbedding(
                 project_dim=hidden_dims[i],
-                patch_size=
-                stride=
+                patch_size=layerwise_patch_sizes[i],
+                stride=layerwise_strides[i],
                 name=f"patch_and_embed_{i}",
             )
             patch_embedding_layers.append(patch_embed_layer)
@@ -91,16 +105,16 @@ class MiTBackbone(FeaturePyramidBackbone):
             transformer_block = [
                 HierarchicalTransformerEncoder(
                     project_dim=hidden_dims[i],
-                    num_heads=
-                    sr_ratio=
+                    num_heads=layerwise_num_heads[i],
+                    sr_ratio=layerwise_sr_ratios[i],
                     drop_prob=dpr[cur + k],
                     name=f"hierarchical_encoder_{i}_{k}",
                 )
-                for k in range(
+                for k in range(layerwise_depths[i])
             ]
             transformer_blocks.append(transformer_block)
-            cur +=
-            layer_norms.append(keras.layers.LayerNormalization())
+            cur += layerwise_depths[i]
+            layer_norms.append(keras.layers.LayerNormalization(epsilon=1e-5))

         # === Functional Model ===
         image_input = keras.layers.Input(shape=image_shape)
@@ -109,7 +123,7 @@ class MiTBackbone(FeaturePyramidBackbone):
         for i in range(num_layers):
             # Compute new height/width after the `proj`
             # call in `OverlappingPatchingAndEmbedding`
-            stride =
+            stride = layerwise_strides[i]
             new_height, new_width = (
                 int(ops.shape(x)[1] / stride),
                 int(ops.shape(x)[2] / stride),
@@ -127,30 +141,30 @@ class MiTBackbone(FeaturePyramidBackbone):
         super().__init__(inputs=image_input, outputs=x, **kwargs)

         # === Config ===
-        self.
+        self.layerwise_depths = layerwise_depths
         self.image_shape = image_shape
         self.hidden_dims = hidden_dims
         self.pyramid_outputs = pyramid_outputs
         self.num_layers = num_layers
-        self.
-        self.
+        self.layerwise_num_heads = layerwise_num_heads
+        self.layerwise_sr_ratios = layerwise_sr_ratios
         self.max_drop_path_rate = max_drop_path_rate
-        self.
-        self.
+        self.layerwise_patch_sizes = layerwise_patch_sizes
+        self.layerwise_strides = layerwise_strides

     def get_config(self):
         config = super().get_config()
         config.update(
             {
-                "
+                "layerwise_depths": self.layerwise_depths,
                 "hidden_dims": self.hidden_dims,
                 "image_shape": self.image_shape,
                 "num_layers": self.num_layers,
-                "
-                "
+                "layerwise_num_heads": self.layerwise_num_heads,
+                "layerwise_sr_ratios": self.layerwise_sr_ratios,
                 "max_drop_path_rate": self.max_drop_path_rate,
-                "
-                "
+                "layerwise_patch_sizes": self.layerwise_patch_sizes,
+                "layerwise_strides": self.layerwise_strides,
             }
         )
         return config
keras_hub/src/models/mit/mit_layers.py

@@ -183,20 +183,21 @@ class SegFormerMultiheadAttention(keras.layers.Layer):
         self.k = keras.layers.Dense(project_dim)
         self.v = keras.layers.Dense(project_dim)
         self.proj = keras.layers.Dense(project_dim)
+        self.dropout = keras.layers.Dropout(0.1)
+        self.proj_drop = keras.layers.Dropout(0.1)

         if sr_ratio > 1:
             self.sr = keras.layers.Conv2D(
                 filters=project_dim,
                 kernel_size=sr_ratio,
                 strides=sr_ratio,
-                padding="same",
             )
-            self.norm = keras.layers.LayerNormalization()
+            self.norm = keras.layers.LayerNormalization(epsilon=1e-5)

     def call(self, x):
         input_shape = ops.shape(x)
         H, W = int(math.sqrt(input_shape[1])), int(math.sqrt(input_shape[1]))
-        B, C = input_shape[0], input_shape[2]
+        B, N, C = input_shape[0], input_shape[1], input_shape[2]

         q = self.q(x)
         q = ops.reshape(
@@ -212,12 +213,11 @@ class SegFormerMultiheadAttention(keras.layers.Layer):

         if self.sr_ratio > 1:
             x = ops.reshape(
-
+                x,
                 (B, H, W, C),
             )
             x = self.sr(x)
-            x = ops.reshape(x, [
-            x = ops.transpose(x, [0, 2, 1])
+            x = ops.reshape(x, [B, -1, C])
             x = self.norm(x)

         k = self.k(x)
@@ -241,14 +241,16 @@ class SegFormerMultiheadAttention(keras.layers.Layer):

         attn = (q @ ops.transpose(k, [0, 1, 3, 2])) * self.scale
         attn = ops.nn.softmax(attn, axis=-1)
+        attn = self.dropout(attn)

         attn = attn @ v
         attn = ops.reshape(
             ops.transpose(attn, [0, 2, 1, 3]),
-            [
+            [B, N, C],
         )

         x = self.proj(attn)
+        x = self.proj_drop(x)
         return x

keras_hub/src/models/mit/mit_presets.py

@@ -18,10 +18,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 8 transformer blocks."
             ),
             "params": 3321962,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b0_ade20k_512/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b0_ade20k_512/2",
     },
     "mit_b1_ade20k_512": {
         "metadata": {
@@ -29,10 +28,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 8 transformer blocks."
             ),
             "params": 13156554,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b1_ade20k_512/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b1_ade20k_512/2",
     },
     "mit_b2_ade20k_512": {
         "metadata": {
@@ -40,10 +38,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 16 transformer blocks."
             ),
             "params": 24201418,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b2_ade20k_512/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b2_ade20k_512/2",
     },
     "mit_b3_ade20k_512": {
         "metadata": {
@@ -51,10 +48,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 28 transformer blocks."
             ),
             "params": 44077258,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b3_ade20k_512/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b3_ade20k_512/2",
     },
     "mit_b4_ade20k_512": {
         "metadata": {
@@ -62,10 +58,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 41 transformer blocks."
             ),
             "params": 60847818,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b4_ade20k_512/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b4_ade20k_512/2",
     },
     "mit_b5_ade20k_640": {
         "metadata": {
@@ -73,10 +68,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 52 transformer blocks."
             ),
             "params": 81448138,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b5_ade20k_640/2",
     },
     "mit_b0_cityscapes_1024": {
         "metadata": {
@@ -84,10 +78,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 8 transformer blocks."
             ),
             "params": 3321962,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b0_cityscapes_1024/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b0_cityscapes_1024/2",
     },
     "mit_b1_cityscapes_1024": {
         "metadata": {
@@ -95,10 +88,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 8 transformer blocks."
             ),
             "params": 13156554,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b1_cityscapes_1024/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b1_cityscapes_1024/2",
     },
     "mit_b2_cityscapes_1024": {
         "metadata": {
@@ -106,10 +98,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 16 transformer blocks."
             ),
             "params": 24201418,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b2_cityscapes_1024/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b2_cityscapes_1024/2",
     },
     "mit_b3_cityscapes_1024": {
         "metadata": {
@@ -117,10 +108,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 28 transformer blocks."
             ),
             "params": 44077258,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b3_cityscapes_1024/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b3_cityscapes_1024/2",
     },
     "mit_b4_cityscapes_1024": {
         "metadata": {
@@ -128,10 +118,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 41 transformer blocks."
             ),
             "params": 60847818,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b4_cityscapes_1024/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b4_cityscapes_1024/2",
     },
     "mit_b5_cityscapes_1024": {
         "metadata": {
@@ -139,10 +128,9 @@ backbone_presets_with_weights = {
                 "MiT (MixTransformer) model with 52 transformer blocks."
             ),
             "params": 81448138,
-            "official_name": "MiT",
             "path": "mit",
         },
-        "kaggle_handle": "kaggle://keras/mit/keras/mit_b5_cityscapes_1024/
+        "kaggle_handle": "kaggle://keras/mit/keras/mit_b5_cityscapes_1024/2",
     },
 }

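All MiT presets above now point at version `/2` of their Kaggle handles and drop `official_name` from the metadata. Loading by name is unchanged; a small sketch, assuming the backbone class consumes these `backbone_presets_with_weights` entries:

```python
from keras_hub.src.models.mit.mit_backbone import MiTBackbone

# Resolves "mit_b0_ade20k_512" to the "/2" Kaggle handle listed above and
# downloads/caches the converted weights on first use.
backbone = MiTBackbone.from_preset("mit_b0_ade20k_512")
```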
keras_hub/src/models/opt/opt_presets.py

@@ -9,9 +9,7 @@ backbone_presets = {
                 "BookCorpus, CommonCrawl, Pile, and PushShift.io corpora."
             ),
             "params": 125237760,
-            "official_name": "OPT",
             "path": "opt",
-            "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md",
         },
         "kaggle_handle": "kaggle://keras/opt/keras/opt_125m_en/2",
     },
@@ -24,9 +22,7 @@ backbone_presets = {
                 "BookCorpus, CommonCrawl, Pile, and PushShift.io corpora."
             ),
             "params": 1315753984,
-            "official_name": "OPT",
             "path": "opt",
-            "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md",
         },
         "kaggle_handle": "kaggle://keras/opt/keras/opt_1.3b_en/2",
     },
@@ -37,9 +33,7 @@ backbone_presets = {
                 "BookCorpus, CommonCrawl, Pile, and PushShift.io corpora."
             ),
             "params": 2700000000,
-            "official_name": "OPT",
             "path": "opt",
-            "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md",
         },
         "kaggle_handle": "kaggle://keras/opt/keras/opt_2.7b_en/2",
     },
@@ -50,9 +44,7 @@ backbone_presets = {
                 "BookCorpus, CommonCrawl, Pile, and PushShift.io corpora."
             ),
             "params": 6700000000,
-            "official_name": "OPT",
             "path": "opt",
-            "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md",
         },
         "kaggle_handle": "kaggle://keras/opt/keras/opt_6.7b_en/2",
     },
keras_hub/src/models/pali_gemma/pali_gemma_backbone.py

@@ -48,22 +48,40 @@ class PaliGemmaBackbone(Backbone):
             a two-layer feedforward network for each transformer decoder block.
         head_dim: int. The size of each attention head in the mixed decoder.
         vit_patch_size: int. The size of each square patch in the input image.
-        vit_num_heads: int. The number of attention heads for the vision(image)
+        vit_num_heads: int. The number of attention heads for the vision (image)
             transformer encoder.
         vit_hidden_dim: int. The size of the transformer hidden state at the end
             of each vision transformer layer.
         vit_num_layers: int. The number of vision transformer layers.
         vit_intermediate_dim: int. The output dimension of the first Dense layer
-            in a two-layer feedforward network for vision transformer.
-
-
-
+            in a two-layer feedforward network for vision transformer. Defaults
+            to `4304`.
+        vit_pooling: `None` or string. The encoded vision embeddings are pooled
+            using the specified polling setting. The accepted values are
+            `"map"`, `"gap"`, `"0"` or `None`. Defaults to `None`.
         vit_classifier_activation: activation function. The activation that
             is used for final output classification in the vision transformer.
+            Defaults to `None`.
         vit_name: string. The name used for vision transformer layers.
+        query_head_dim_normalize: boolean. If `True` normalize the query before
+            attention with `head_dim`. If `False`, normalize the query with
+            `hidden_dim / num_query_heads`. Defaults to `True`.
+        use_post_ffw_norm: boolean. Whether to normalize after the feedforward
+            block. Defaults to `False`.
+        use_post_attention_norm: boolean. Whether to normalize after the attention
+            block. Defaults to `False`.
+        attention_logit_soft_cap: `None` or int. Soft cap for the attention
+            logits. Defaults to `None`.
+        final_logit_soft_cap: `None` or int. Soft cap for the final logits.
+            Defaults to `None`.
+        use_sliding_window_attention: boolean. Whether to use sliding local
+            window attention. Defaults to `False`.
+        sliding_window_size: int. Size of the sliding local window. Defaults to
+            `4096`.
         layer_norm_epsilon: float. The epsilon value user for every layer norm
-            in all transformer blocks.
+            in all transformer blocks. Defaults to `1e-6`.
         dropout: float. Dropout probability for the Transformer decoder blocks.
+            Defaults to `0`.
         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
             for the models computations and weights. Note that some
             computations, such as softmax and layer normalization will always
@@ -119,6 +137,13 @@ class PaliGemmaBackbone(Backbone):
         vit_pooling=None,
         vit_classifier_activation=None,
         vit_name=None,
+        query_head_dim_normalize=True,
+        use_post_ffw_norm=False,
+        use_post_attention_norm=False,
+        attention_logit_soft_cap=None,
+        final_logit_soft_cap=None,
+        use_sliding_window_attention=False,
+        sliding_window_size=4096,
         layer_norm_epsilon=1e-6,
         dropout=0,
         dtype=None,
@@ -136,6 +161,7 @@ class PaliGemmaBackbone(Backbone):
                 seed=None,
             ),
             dtype=dtype,
+            logit_soft_cap=final_logit_soft_cap,
             name="token_embedding",
         )
         # TODO Remove this. Work around for previous serialization bug.
@@ -155,12 +181,19 @@ class PaliGemmaBackbone(Backbone):
         )
         self.transformer_layers = []
         for i in range(num_layers):
+            sliding_window = use_sliding_window_attention and (i % 2 == 0)
             layer = PaliGemmaDecoderBlock(
                 hidden_dim=hidden_dim,
                 intermediate_dim=intermediate_dim,
-                num_query_heads=num_query_heads,
                 head_dim=head_dim,
+                num_query_heads=num_query_heads,
                 num_key_value_heads=num_key_value_heads,
+                query_head_dim_normalize=query_head_dim_normalize,
+                use_post_ffw_norm=use_post_ffw_norm,
+                use_post_attention_norm=use_post_attention_norm,
+                logit_soft_cap=attention_logit_soft_cap,
+                use_sliding_window_attention=sliding_window,
+                sliding_window_size=sliding_window_size,
                 dropout=dropout,
                 dtype=dtype,
                 name=f"decoder_block_{i}",
@@ -173,7 +206,9 @@ class PaliGemmaBackbone(Backbone):
         )

         # === Functional Model ===
-        image_input =
+        image_input = keras.Input(
+            shape=(image_size, image_size, 3), name="images"
+        )
         token_id_input = keras.Input(
             shape=(None,), dtype="int32", name="token_ids"
         )
@@ -219,7 +254,15 @@ class PaliGemmaBackbone(Backbone):
         self.head_dim = head_dim
         self.layer_norm_epsilon = layer_norm_epsilon
         self.dropout = dropout
-        #
+        # Gemma2 params
+        self.query_head_dim_normalize = query_head_dim_normalize
+        self.use_post_ffw_norm = use_post_ffw_norm
+        self.use_post_attention_norm = use_post_attention_norm
+        self.attention_logit_soft_cap = attention_logit_soft_cap
+        self.final_logit_soft_cap = final_logit_soft_cap
+        self.sliding_window_size = sliding_window_size
+        self.use_sliding_window_attention = use_sliding_window_attention
+        # ViT params
         self.vit_patch_size = vit_patch_size
         self.vit_num_heads = vit_num_heads
         self.vit_hidden_dim = vit_hidden_dim
@@ -243,8 +286,6 @@ class PaliGemmaBackbone(Backbone):
                 "hidden_dim": self.hidden_dim,
                 "intermediate_dim": self.intermediate_dim,
                 "head_dim": self.head_dim,
-                "layer_norm_epsilon": self.layer_norm_epsilon,
-                "dropout": self.dropout,
                 "vit_patch_size": self.vit_patch_size,
                 "vit_num_heads": self.vit_num_heads,
                 "vit_hidden_dim": self.vit_hidden_dim,
@@ -253,6 +294,15 @@ class PaliGemmaBackbone(Backbone):
                 "vit_pooling": self.vit_pooling,
                 "vit_classifier_activation": self.vit_classifier_activation,
                 "vit_name": self.vit_name,
+                "query_head_dim_normalize": self.query_head_dim_normalize,
+                "use_post_ffw_norm": self.use_post_ffw_norm,
+                "use_post_attention_norm": self.use_post_attention_norm,
+                "final_logit_soft_cap": self.final_logit_soft_cap,
+                "attention_logit_soft_cap": self.attention_logit_soft_cap,
+                "sliding_window_size": self.sliding_window_size,
+                "use_sliding_window_attention": self.use_sliding_window_attention,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+                "dropout": self.dropout,
             }
         )
         return config
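The backbone now accepts Gemma2-style options: post-attention and post-feedforward norms, attention and final logit soft-capping, and sliding-window attention interleaved on every other layer. A construction sketch; the sizes below are made-up toy values and the set of required arguments is inferred from the docstring above, so treat this as illustrative rather than a canonical configuration:

```python
from keras_hub.src.models.pali_gemma.pali_gemma_backbone import (
    PaliGemmaBackbone,
)

# Toy-sized, assumed configuration; only the Gemma2-related keyword names
# are taken from this diff.
backbone = PaliGemmaBackbone(
    vocabulary_size=1000,
    image_size=224,
    num_layers=2,
    num_query_heads=4,
    num_key_value_heads=1,
    hidden_dim=128,
    intermediate_dim=256,
    head_dim=32,
    vit_patch_size=14,
    vit_num_heads=2,
    vit_hidden_dim=64,
    vit_num_layers=2,
    # New Gemma2-style options introduced in this version:
    query_head_dim_normalize=True,
    use_post_ffw_norm=True,
    use_post_attention_norm=True,
    attention_logit_soft_cap=50,
    final_logit_soft_cap=30,
    use_sliding_window_attention=True,
    sliding_window_size=4096,
)
```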
keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py

@@ -31,33 +31,25 @@ class PaliGemmaDecoderBlock(GemmaDecoderBlock):
             the attention layer.
         num_key_value_heads: int. The number of heads for the key and value
             projections in the attention layer.
+        query_head_dim_normalize: boolean. If `True` normalize the query before
+            attention with `head_dim`. If `False`, normalize the query with
+            `hidden_dim / num_query_heads`. Defaults to `True`.
+        use_post_ffw_norm: boolean. Whether to normalize after the feedforward
+            block. Defaults to `False`.
+        use_post_attention_norm: boolean. Whether to normalize after the
+            attention block. Defaults to `False`.
+        logit_soft_cap: `None` or int. Soft cap for the attention logits.
+            Defaults to `None`.
+        use_sliding_window_attention: boolean. Whether to use sliding local
+            window attention. Defaults to `False`.
+        sliding_window_size: int. Size of the sliding local window. Defaults to
+            `4096`.
         layer_norm_epsilon: float. The epsilon hyperparameter used for layer
-            normalization.
+            normalization. Defaults to `1e-6`.
         dropout: float. The dropout rate for the transformer attention layer.
+            Defaults to `0`.
     """

-    def __init__(
-        self,
-        hidden_dim,
-        intermediate_dim,
-        head_dim,
-        num_query_heads,
-        num_key_value_heads,
-        layer_norm_epsilon=1e-6,
-        dropout=0,
-        **kwargs,
-    ):
-        super().__init__(
-            hidden_dim=hidden_dim,
-            intermediate_dim=intermediate_dim,
-            head_dim=head_dim,
-            num_query_heads=num_query_heads,
-            num_key_value_heads=num_key_value_heads,
-            layer_norm_epsilon=layer_norm_epsilon,
-            dropout=dropout,
-            **kwargs,
-        )
-
     def call(
         self,
         x,
@@ -83,6 +75,9 @@ class PaliGemmaDecoderBlock(GemmaDecoderBlock):
             attention_mask=attention_mask,
         )

+        if self.use_post_attention_norm:
+            attention = self.post_attention_norm(attention)
+
         if self.dropout:
             attention = self.attention_dropout(attention)

@@ -94,6 +89,9 @@ class PaliGemmaDecoderBlock(GemmaDecoderBlock):
         x = keras.activations.gelu(x1, approximate=True) * x2
         x = self.ffw_linear(x)

+        if self.use_post_ffw_norm:
+            x = self.post_ffw_norm(x)
+
         x = x + attention_x

         if cache is not None: