nmn 0.1.6.tar.gz → 0.1.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nmn
-Version: 0.1.6
+Version: 0.1.7
 Summary: a neuron that matter
 Project-URL: Homepage, https://github.com/mlnomadpy/nmn
 Project-URL: Bug Tracker, https://github.com/mlnomadpy/my_package/issues
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "nmn"
-version = "0.1.6"
+version = "0.1.7"
 authors = [
   { name="Taha Bouhsine", email="yat@mlnomads.com" },
 ]
@@ -29,6 +29,7 @@ from flax.typing import (
 from nmn.nnx.nmn import YatNMN
 from jax import Array
 
+from nmn.nnx.squashers import softermax
 def yat_attention_weights(
     query: Array,
     key: Array,
@@ -42,6 +43,7 @@ def yat_attention_weights(
     precision: PrecisionLike = None,
     module: Optional[Module] = None,
     epsilon: float = 1e-5,
+    use_softermax: bool = False,
 ):
   """Computes attention weights using YatNMN distance-based calculation."""
   query, key = promote_dtype((query, key), dtype=dtype)
@@ -86,7 +88,10 @@ def yat_attention_weights(
     attn_weights = jnp.where(mask, attn_weights, big_neg)
 
   # normalize the attention weights
-  attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
+  if use_softermax:
+    attn_weights = softermax(attn_weights).astype(dtype)
+  else:
+    attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
 
   if module:
     module.sow(nnx.Intermediate, 'attention_weights', attn_weights)
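
The new use_softermax flag switches the final normalization from jax.nn.softmax to the softermax squasher imported from nmn.nnx.squashers. The diff does not include softermax's definition, so the sketch below is only a hypothetical stand-in under that caveat: it illustrates one common "softer" normalization (a stability-shifted power ratio rather than an exponential), and the names softermax_sketch, n, and epsilon are illustrative, not the package's API.

# Hypothetical stand-in only: the real softermax lives in nmn.nnx.squashers and
# its exact formula is not shown in this diff.
import jax.numpy as jnp
from jax import Array

def softermax_sketch(x: Array, n: float = 1.0, epsilon: float = 1e-12, axis: int = -1) -> Array:
  """Power-based normalization over `axis` (illustrative, not the package's definition)."""
  # Shift so the smallest score maps to zero and all inputs are non-negative before the power.
  shifted = x - jnp.min(x, axis=axis, keepdims=True)
  powered = shifted ** n
  # epsilon keeps the division finite when every score in the slice is identical.
  return powered / (jnp.sum(powered, axis=axis, keepdims=True) + epsilon)

scores = jnp.array([[1.0, 2.0, 4.0]])
print(softermax_sketch(scores))  # weights are non-negative and sum to ~1 along the last axis
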
@@ -120,6 +125,7 @@ def yat_attention(
     precision: PrecisionLike = None,
     module: Optional[Module] = None,
     epsilon: float = 1e-5,
+    use_softermax: bool = False,
 ):
   """Computes attention using YatNMN distance-based calculation."""
   query, key, value = promote_dtype((query, key, value), dtype=dtype)
@@ -147,6 +153,7 @@ def yat_attention(
     precision,
     module,
     epsilon,
+    use_softermax,
   )
 
   # return weighted sum over values for each query position
@@ -362,6 +369,7 @@ class MultiHeadAttention(Module):
     out_dot_general_cls: Any = None,
     rngs: rnglib.Rngs,
     epsilon: float = 1e-5,
+    use_softermax: bool = False,
   ):
     self.num_heads = num_heads
     self.in_features = in_features
@@ -390,6 +398,7 @@ class MultiHeadAttention(Module):
     self.qkv_dot_general_cls = qkv_dot_general_cls
     self.out_dot_general_cls = out_dot_general_cls
     self.epsilon = epsilon
+    self.use_softermax = use_softermax
     self.use_alpha = use_alpha
     self.alpha_init = alpha_init
     self.use_dropconnect = use_dropconnect
@@ -621,6 +630,7 @@ class MultiHeadAttention(Module):
       precision=self.precision,
       module=self if sow_weights else None,
       epsilon=self.epsilon,  # Pass epsilon to yat_attention
+      use_softermax=self.use_softermax,
     )
     # Reshape attention output back to original embedding dimension
     # from [batch..., length, num_heads, head_dim] to [batch..., length, qkv_features]
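
In MultiHeadAttention the flag is stored on the module and forwarded into yat_attention, so callers can opt in per layer. A minimal usage sketch, assuming the class is importable from nmn.nnx.yatattention and that the remaining constructor and call arguments mirror flax.nnx.MultiHeadAttention; the import path, decode argument, and shapes below are assumptions rather than facts from this diff.

import jax.numpy as jnp
from flax import nnx
from nmn.nnx.yatattention import MultiHeadAttention  # import path assumed, not shown in this diff

# Build the layer with the flag introduced in 0.1.7.
attn = MultiHeadAttention(
  num_heads=4,
  in_features=64,
  rngs=nnx.Rngs(0),
  use_softermax=True,  # normalize attention scores with softermax instead of jax.nn.softmax
)

x = jnp.ones((2, 16, 64))   # [batch, length, features]
y = attn(x, decode=False)   # self-attention; decode handling assumed to follow flax.nnx
print(y.shape)              # expected: (2, 16, 64)
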
All other files: no changes.