keras-hub-nightly 0.21.0.dev202505200408__py3-none-any.whl → 0.21.0.dev202505220409__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- keras_hub/src/models/mixtral/mixtral_attention.py +31 -42
- keras_hub/src/models/qwen_moe/qwen_moe_attention.py +7 -9
- keras_hub/src/utils/transformers/convert_mixtral.py +1 -1
- keras_hub/src/version.py +1 -1
- {keras_hub_nightly-0.21.0.dev202505200408.dist-info → keras_hub_nightly-0.21.0.dev202505220409.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.21.0.dev202505200408.dist-info → keras_hub_nightly-0.21.0.dev202505220409.dist-info}/RECORD +8 -8
- {keras_hub_nightly-0.21.0.dev202505200408.dist-info → keras_hub_nightly-0.21.0.dev202505220409.dist-info}/WHEEL +1 -1
- {keras_hub_nightly-0.21.0.dev202505200408.dist-info → keras_hub_nightly-0.21.0.dev202505220409.dist-info}/top_level.txt +0 -0
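To confirm which of the two nightlies is installed locally, a standard-library check is enough (the distribution name comes from the wheel filenames above):

from importlib.metadata import version

# Prints the installed nightly build, e.g. 0.21.0.dev202505220409 after upgrading.
print(version("keras-hub-nightly"))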
keras_hub/src/models/mixtral/mixtral_attention.py
CHANGED
@@ -27,19 +27,19 @@ class CachedMixtralAttention(keras.layers.Layer):
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.
-        self.
-        self.
-        self.
+        self.num_query_heads = num_query_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.sliding_window = sliding_window
+        self.dropout = dropout
 
-        self.
-        self.
+        self.num_key_value_groups = num_query_heads // num_key_value_heads
+        self.rope_max_wavelength = rope_max_wavelength
 
         self._kernel_initializer = keras.initializers.get(
             clone_initializer(kernel_initializer)
         )
 
-        self.
+        self.rope_scaling_factor = rope_scaling_factor
 
     def build(self, inputs_shape):
         # Einsum variables:
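The renamed attributes keep the same grouped-query bookkeeping: num_key_value_groups is just the ratio of query heads to key/value heads, so each key/value head serves a fixed-size group of query heads. A minimal sketch of that arithmetic, with hypothetical head counts:

# Hypothetical head counts, only to illustrate the grouping arithmetic above.
num_query_heads = 32
num_key_value_heads = 8
assert num_query_heads % num_key_value_heads == 0
num_key_value_groups = num_query_heads // num_key_value_heads
print(num_key_value_groups)  # 4: each key/value head is shared by 4 query heads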
@@ -51,12 +51,12 @@ class CachedMixtralAttention(keras.layers.Layer):
         # v = num key/value heads
         # h = head dim
         self._hidden_dim = inputs_shape[-1]
-        self._head_dim = self._hidden_dim // self.
+        self._head_dim = self._hidden_dim // self.num_query_heads
         self._inv_norm_factor = 1.0 / math.sqrt(self._head_dim)
 
         self.query_dense = keras.layers.EinsumDense(
             equation="bqm,muh->bquh",
-            output_shape=(None, self.
+            output_shape=(None, self.num_query_heads, self._head_dim),
             kernel_initializer=self._kernel_initializer,
             dtype=self.dtype_policy,
             name="query",
@@ -67,7 +67,7 @@ class CachedMixtralAttention(keras.layers.Layer):
             equation="bkm,mvh->bkvh",
             output_shape=(
                 None,
-                self.
+                self.num_key_value_heads,
                 self._head_dim,
             ),
             kernel_initializer=self._kernel_initializer,
@@ -80,7 +80,7 @@ class CachedMixtralAttention(keras.layers.Layer):
             equation="bkm,mvh->bkvh",
             output_shape=(
                 None,
-                self.
+                self.num_key_value_heads,
                 self._head_dim,
             ),
             kernel_initializer=self._kernel_initializer,
@@ -89,31 +89,31 @@ class CachedMixtralAttention(keras.layers.Layer):
         )
         self.value_dense.build(inputs_shape)
 
-        self.
+        self.softmax = keras.layers.Softmax(
             axis=-1,
             dtype="float32",
             name="attention_softmax",
         )
 
-        self.
-            rate=self.
+        self.dropout_layer = keras.layers.Dropout(
+            rate=self.dropout,
             dtype=self.dtype_policy,
         )
 
-        self.
+        self.output_dense = keras.layers.EinsumDense(
             equation="bquh,uhm->bqm",
             output_shape=(None, self._hidden_dim),
             kernel_initializer=self._kernel_initializer,
             dtype=self.dtype_policy,
             name="attention_output",
         )
-        self.
-            (None, None, self.
+        self.output_dense.build(
+            (None, None, self.num_query_heads, self._head_dim)
         )
 
         self.rotary_embedding_layer = RotaryEmbedding(
-            max_wavelength=self.
-            scaling_factor=self.
+            max_wavelength=self.rope_max_wavelength,
+            scaling_factor=self.rope_scaling_factor,
             dtype=self.dtype_policy,
         )
 
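The rebuilt sublayers keep the same einsum projections; only the attribute names change. A small sketch, with hypothetical sizes, of how the "bqm,muh->bquh" query projection and the "bquh,uhm->bqm" output projection map between the hidden size and per-head shapes:

import numpy as np
import keras

batch, seq_len, hidden_dim, num_query_heads = 2, 4, 64, 8  # hypothetical sizes
head_dim = hidden_dim // num_query_heads

# Hidden states -> per-head queries: (batch, q, hidden) -> (batch, q, heads, head_dim).
query_dense = keras.layers.EinsumDense(
    "bqm,muh->bquh", output_shape=(None, num_query_heads, head_dim)
)
# Per-head outputs -> hidden states: (batch, q, heads, head_dim) -> (batch, q, hidden).
output_dense = keras.layers.EinsumDense(
    "bquh,uhm->bqm", output_shape=(None, hidden_dim)
)

x = np.random.rand(batch, seq_len, hidden_dim).astype("float32")
q = query_dense(x)
y = output_dense(q)
print(q.shape, y.shape)  # (2, 4, 8, 8) (2, 4, 64)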
@@ -168,18 +168,18 @@ class CachedMixtralAttention(keras.layers.Layer):
 
         # [batch_shape, seq_len, num_key_value_heads, head_dim]
         # -> [batch_shape, seq_len, num_heads, head_dim]
-        key = ops.repeat(key, repeats=self.
-        value = ops.repeat(value, repeats=self.
+        key = ops.repeat(key, repeats=self.num_key_value_groups, axis=2)
+        value = ops.repeat(value, repeats=self.num_key_value_groups, axis=2)
 
         attention_output = self._compute_attention(
             query, key, value, attention_mask
         )
 
-        attention_output = self.
+        attention_output = self.dropout_layer(
             attention_output, training=training
         )
 
-        attention_output = self.
+        attention_output = self.output_dense(attention_output)
 
         if cache is not None:
             return attention_output, cache
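The repeat step expands the key/value heads along axis 2 so they line up one-to-one with the query heads, which is how grouped-query attention reuses each key/value head across its group. A tiny sketch with made-up shapes:

import numpy as np
from keras import ops

num_query_heads, num_key_value_heads = 4, 2  # hypothetical
num_key_value_groups = num_query_heads // num_key_value_heads

# (batch, seq_len, num_key_value_heads, head_dim)
key = np.zeros((1, 3, num_key_value_heads, 8), dtype="float32")
key = ops.repeat(key, repeats=num_key_value_groups, axis=2)
print(key.shape)  # (1, 3, 4, 8): one key/value slice per query head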
@@ -187,10 +187,8 @@ class CachedMixtralAttention(keras.layers.Layer):
 
     def _masked_softmax(self, attention_scores, attention_mask=None):
         if attention_mask is not None:
-            return self.
-
-            )
-        return self._softmax(attention_scores)
+            return self.softmax(attention_scores, attention_mask[:, None, :, :])
+        return self.softmax(attention_scores)
 
     def _use_fused_attention_op(self):
         if not fused_attention_op_available():
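The renamed softmax layer still receives the attention mask broadcast over the head axis (attention_mask[:, None, :, :]). A minimal sketch of that masking behaviour with toy scores and a causal mask (shapes are hypothetical):

import numpy as np
from keras import layers, ops

softmax = layers.Softmax(axis=-1, dtype="float32")

scores = np.zeros((1, 2, 3, 3), dtype="float32")  # (batch, heads, q_len, k_len)
mask = np.tril(np.ones((1, 3, 3), dtype=bool))    # (batch, q_len, k_len); True = attend

# Broadcast the mask across the head axis, as in _masked_softmax above.
probs = softmax(scores, mask[:, None, :, :])
print(ops.convert_to_numpy(probs)[0, 0])  # each row sums to 1 over the unmasked keys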
@@ -198,9 +196,6 @@ class CachedMixtralAttention(keras.layers.Layer):
         if self.dropout > 0.0:
             return False
         if running_on_gpu():
-            # GPU never supports softcap in the fused op.
-            if self.logit_soft_cap is not None:
-                return False
             return gpu_supports_fused_attention_op()
         elif running_on_tpu():
             # TPU supports softcap with on keras >= 3.10.
@@ -215,18 +210,12 @@ class CachedMixtralAttention(keras.layers.Layer):
                 attention_mask = ops.expand_dims(attention_mask, axis=1)
                 attention_mask = ops.cast(attention_mask, dtype="bool")
 
-            if self.logit_soft_cap:
-                kwargs = {"attn_logits_soft_cap": self.logit_soft_cap}
-            else:
-                kwargs = {}
-
             attention_output = ops.dot_product_attention(
                 query,
                 key,
                 value,
                 mask=attention_mask,
                 scale=self._inv_norm_factor,
-                **kwargs,
             )
             return attention_output
 
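With the unused soft-cap plumbing gone, the fused path is a plain ops.dot_product_attention call with only a mask and a scale. A sketch of the call, mirroring the arguments left in the diff (tensor sizes are hypothetical):

import math

import numpy as np
from keras import ops

batch, seq_len, num_heads, head_dim = 1, 4, 2, 8  # hypothetical
query = np.random.rand(batch, seq_len, num_heads, head_dim).astype("float32")
key = np.random.rand(batch, seq_len, num_heads, head_dim).astype("float32")
value = np.random.rand(batch, seq_len, num_heads, head_dim).astype("float32")
mask = np.ones((batch, 1, seq_len, seq_len), dtype=bool)  # broadcast over heads

out = ops.dot_product_attention(
    query, key, value, mask=mask, scale=1.0 / math.sqrt(head_dim)
)
print(out.shape)  # (1, 4, 2, 8): same shape as the query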
@@ -249,15 +238,15 @@ class CachedMixtralAttention(keras.layers.Layer):
         config = super().get_config()
         config.update(
             {
-                "num_query_heads": self.
-                "num_key_value_heads": self.
-                "rope_max_wavelength": self.
-                "rope_scaling_factor": self.
+                "num_query_heads": self.num_query_heads,
+                "num_key_value_heads": self.num_key_value_heads,
+                "rope_max_wavelength": self.rope_max_wavelength,
+                "rope_scaling_factor": self.rope_scaling_factor,
                 "kernel_initializer": keras.initializers.serialize(
                     self._kernel_initializer
                 ),
-                "sliding_window": self.
-                "dropout": self.
+                "sliding_window": self.sliding_window,
+                "dropout": self.dropout,
             }
         )
         return config
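Since get_config now emits the public attribute names, a config round trip stays symmetric with the constructor. A hedged sketch, assuming the __init__ arguments mirror the serialized keys (the values below are made up, not a real Mixtral configuration):

from keras_hub.src.models.mixtral.mixtral_attention import (
    CachedMixtralAttention,
)

layer = CachedMixtralAttention(
    num_query_heads=8,
    num_key_value_heads=2,
    rope_max_wavelength=10000,
    rope_scaling_factor=1.0,
    sliding_window=512,
    dropout=0.0,
)
restored = CachedMixtralAttention.from_config(layer.get_config())
assert restored.num_query_heads == 8
assert restored.sliding_window == 512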
keras_hub/src/models/qwen_moe/qwen_moe_attention.py
CHANGED
@@ -256,9 +256,6 @@ class QwenMoeAttention(keras.layers.Layer):
         if self.dropout > 0.0:
             return False
         if running_on_gpu():
-            # GPU never supports softcap in the fused op.
-            if self.logit_soft_cap is not None:
-                return False
             return gpu_supports_fused_attention_op()
         elif running_on_tpu():
             # TPU supports softcap with on keras >= 3.10.
@@ -268,7 +265,13 @@ class QwenMoeAttention(keras.layers.Layer):
         return False
 
     def _compute_attention(
-        self,
+        self,
+        query,
+        key,
+        value,
+        attention_mask=None,
+        cache_update_index=None,
+        **kwargs,
     ):
         """Computes attention using query, key, and value tensors.
 
@@ -289,11 +292,6 @@ class QwenMoeAttention(keras.layers.Layer):
                 attention_mask = ops.expand_dims(attention_mask, axis=1)
                 attention_mask = ops.cast(attention_mask, dtype="bool")
 
-            if self.logit_soft_cap:
-                kwargs = {"attn_logits_soft_cap": self.logit_soft_cap}
-            else:
-                kwargs = {}
-
             attention_output = ops.dot_product_attention(
                 query,
                 key,
keras_hub/src/utils/transformers/convert_mixtral.py
CHANGED
@@ -68,7 +68,7 @@ def convert_weights(backbone, loader, transformers_config):
         )
         ## Output
         loader.port_weight(
-            keras_variable=decoder_layer._self_attention_layer.
+            keras_variable=decoder_layer._self_attention_layer.output_dense.kernel,
             hf_weight_key=f"model.layers.{i}.self_attn.o_proj.weight",
             hook_fn=transpose_and_reshape,
         )
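The corrected line points the loader at the renamed output_dense kernel; the transpose_and_reshape hook still has to map the Hugging Face o_proj layout (out_features, in_features) onto the (heads, head_dim, hidden) kernel used by the "bquh,uhm->bqm" einsum. A rough numpy illustration of that mapping; the helper below is an illustrative stand-in, not the library's actual hook:

import numpy as np

hidden_dim, num_heads = 64, 8  # hypothetical sizes
head_dim = hidden_dim // num_heads

# HF stores o_proj.weight as (out_features, in_features) = (hidden, heads * head_dim).
hf_o_proj = np.random.rand(hidden_dim, num_heads * head_dim).astype("float32")

def transpose_and_reshape(weight, target_shape):
    # Stand-in for the loader hook: torch Linear layout -> EinsumDense kernel shape.
    return np.reshape(np.transpose(weight), target_shape)

kernel = transpose_and_reshape(hf_o_proj, (num_heads, head_dim, hidden_dim))
print(kernel.shape)  # (8, 8, 64)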
keras_hub/src/version.py
CHANGED
{keras_hub_nightly-0.21.0.dev202505200408.dist-info → keras_hub_nightly-0.21.0.dev202505220409.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ keras_hub/models/__init__.py,sha256=itSzodVUeuX6HQnmsSXY0Wv-5Htbu397410R-SFW_4I,
 keras_hub/samplers/__init__.py,sha256=aFQIkiqbZpi8vjrPp2MVII4QUfE-eQjra5fMeHsoy7k,886
 keras_hub/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/api_export.py,sha256=9pQZK27JObxWZ96QPLBp1OBsjWigh1iuV6RglPGMRk0,1499
-keras_hub/src/version.py,sha256=
+keras_hub/src/version.py,sha256=ZWHai9U-yJxL-dj1yBgjl16y6XtOeP2SreCCjSf9xgA,222
 keras_hub/src/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/layers/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 keras_hub/src/layers/modeling/alibi_bias.py,sha256=1XBTHI52L_iJDhN_w5ydu_iMhCuTgQAxEPwcLA6BPuk,4411
@@ -250,7 +250,7 @@ keras_hub/src/models/mit/mit_image_classifier_preprocessor.py,sha256=oNYs-pUK8Vn
 keras_hub/src/models/mit/mit_image_converter.py,sha256=Mw7nV-OzyBveGuZUNFsPPKyq9jXJVW2_cVH024CNkXM,311
 keras_hub/src/models/mit/mit_layers.py,sha256=HUJO5uhJ6jgwANpwbQdPlEVwLRVb3BZQ-Ftjg3B9XvY,9734
 keras_hub/src/models/mit/mit_presets.py,sha256=ooLrh2OoGZKxnCGnhB6BynYJtVCXH7nDDFhgQRWt36U,4528
-keras_hub/src/models/mixtral/mixtral_attention.py,sha256=
+keras_hub/src/models/mixtral/mixtral_attention.py,sha256=f5aiTtstWeKG_ZwumAlYIzjIN08CpnxNdenxWNJSwZw,8713
 keras_hub/src/models/mixtral/mixtral_backbone.py,sha256=vUAFXvqwVBgKxYbOsqIHzPN59bhaDrGWwOnBCzeUtt0,8034
 keras_hub/src/models/mixtral/mixtral_causal_lm.py,sha256=JA1t6xTeaYX_fNo9ftRyvzdRDG3vndC-Rlwn5fnsbQo,12001
 keras_hub/src/models/mixtral/mixtral_causal_lm_preprocessor.py,sha256=q2qXa9QAUWBvOWv9DeNvwsBNXSORJAbQFoQsWQ7e8V8,3079
@@ -311,7 +311,7 @@ keras_hub/src/models/qwen/qwen_layernorm.py,sha256=DS35r3qd6g5ocL7Nhf_vNzLLMo1aI
 keras_hub/src/models/qwen/qwen_presets.py,sha256=_jRG7bB4yBGWteBLbK2elc1e9doRl8zdzQRZgxFvnfc,1988
 keras_hub/src/models/qwen/qwen_tokenizer.py,sha256=LCv3IyiDDHqVnM9N3lf5-BE3iwicIh0nKS1hjoPw9lE,1532
 keras_hub/src/models/qwen_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-keras_hub/src/models/qwen_moe/qwen_moe_attention.py,sha256=
+keras_hub/src/models/qwen_moe/qwen_moe_attention.py,sha256=pE79_iHUm2LGkoWL6zMJw_pNfzIvmyq3yJaiq47W2TY,13242
 keras_hub/src/models/qwen_moe/qwen_moe_backbone.py,sha256=nrfELvIvRLmrgKrUNXci2CrecmeI6bWzJj7HH-RcWJA,15341
 keras_hub/src/models/qwen_moe/qwen_moe_causal_lm.py,sha256=MeP60v7GcN_SmH5_ULRpqgmFVgaYAosSecZiSQVlJvU,13256
 keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor.py,sha256=uKaXRrJs02vkVudjdehzJPp0B84tPMkxNHlp166kceE,589
@@ -490,7 +490,7 @@ keras_hub/src/utils/transformers/convert_gemma.py,sha256=ElCgwBpSN5Q7rV5PJawTsoy
 keras_hub/src/utils/transformers/convert_gpt2.py,sha256=HCeHN_-GiQJRxLCM9OCJJ1watPVpIBF8ujS8pGbBOWc,5703
 keras_hub/src/utils/transformers/convert_llama3.py,sha256=c5phNl-QayQ_BS0s-lenbu6oHxqfwDShKJoh9DluxUU,6146
 keras_hub/src/utils/transformers/convert_mistral.py,sha256=kVhN9h1ZFVhwkNW8p3wnS7eANJUXIsNy1RxWXy20Gqw,4760
-keras_hub/src/utils/transformers/convert_mixtral.py,sha256=
+keras_hub/src/utils/transformers/convert_mixtral.py,sha256=PxeCY8Xe7U_caICugwOCEjuSZ51ZUtmef6rUxh-Wt54,5508
 keras_hub/src/utils/transformers/convert_pali_gemma.py,sha256=B1leeDw96Yvu81hYumf66hIid07k5NLqoeWAJgPnaLs,10649
 keras_hub/src/utils/transformers/convert_qwen.py,sha256=WUxMAEFVqRs7TRw7QU5TH3_ev4yf02R1xFVliMvTQqg,5886
 keras_hub/src/utils/transformers/convert_qwen_moe.py,sha256=a7R28aln-PdAcNuKAXdrtzvslho2Co6GypChxLMKPpc,10618
@@ -499,7 +499,7 @@ keras_hub/src/utils/transformers/preset_loader.py,sha256=1nfS5xVsl-JROGXJXltTqV1
 keras_hub/src/utils/transformers/safetensor_utils.py,sha256=CYUHyA4y-B61r7NDnCsFb4t_UmSwZ1k9L-8gzEd6KRg,3339
 keras_hub/tokenizers/__init__.py,sha256=uMjjm0mzUkRb0e4Ac_JK8aJ9cKGUi5UqmzWoWAFJprE,4164
 keras_hub/utils/__init__.py,sha256=jXPqVGBpJr_PpYmqD8aDG-fRMlxH-ulqCR2SZMn288Y,646
-keras_hub_nightly-0.21.0.
-keras_hub_nightly-0.21.0.
-keras_hub_nightly-0.21.0.
-keras_hub_nightly-0.21.0.
+keras_hub_nightly-0.21.0.dev202505220409.dist-info/METADATA,sha256=EqRkCDIuHYBX4sLxSObub9YnmlNwhf_d2-IKG1tm4Xw,7393
+keras_hub_nightly-0.21.0.dev202505220409.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+keras_hub_nightly-0.21.0.dev202505220409.dist-info/top_level.txt,sha256=N4J6piIWBKa38A4uV-CnIopnOEf8mHAbkNXafXm_CuA,10
+keras_hub_nightly-0.21.0.dev202505220409.dist-info/RECORD,,