nmn 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
nmn/nnx/TODO ADDED
@@ -0,0 +1,2 @@
+ - add support to masked kernels
+ - explain attention [directed graph]
nmn/nnx/yatattention.py CHANGED
@@ -29,6 +29,7 @@ from flax.typing import (
  from nmn.nnx.nmn import YatNMN
  from jax import Array

+ from nmn.nnx.squashers import softermax
  def yat_attention_weights(
      query: Array,
      key: Array,
@@ -42,6 +43,7 @@ def yat_attention_weights(
      precision: PrecisionLike = None,
      module: Optional[Module] = None,
      epsilon: float = 1e-5,
+     use_softermax: bool = False,
  ):
    """Computes attention weights using YatNMN distance-based calculation."""
    query, key = promote_dtype((query, key), dtype=dtype)
@@ -86,7 +88,10 @@ def yat_attention_weights(
    attn_weights = jnp.where(mask, attn_weights, big_neg)

    # normalize the attention weights
-   attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
+   if use_softermax:
+     attn_weights = softermax(attn_weights).astype(dtype)
+   else:
+     attn_weights = jax.nn.softmax(attn_weights).astype(dtype)

    if module:
      module.sow(nnx.Intermediate, 'attention_weights', attn_weights)
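
The hunk above is the heart of the change between 0.1.6 and 0.1.8: when use_softermax is set, the attention logits are normalized with softermax from nmn.nnx.squashers instead of jax.nn.softmax. The softermax implementation itself is not included in this diff, so the snippet below is only a minimal sketch of one common "softer" normalization (the denominator-plus-one variant), written to match the single-argument call used above; the real function in nmn.nnx.squashers may compute something different, for example it may take a temperature or power parameter.

    # Illustrative sketch only; NOT the actual nmn.nnx.squashers.softermax.
    # Assumes the "+1 in the denominator" variant, which lets a row of attention
    # weights sum to less than 1 when no key matches the query strongly.
    import jax
    import jax.numpy as jnp

    def softermax_sketch(logits: jax.Array, axis: int = -1) -> jax.Array:
        x_max = jnp.max(logits, axis=axis, keepdims=True)    # shift for numerical stability
        unnorm = jnp.exp(logits - x_max)                      # shifted exponentials
        denom = jnp.sum(unnorm, axis=axis, keepdims=True) + jnp.exp(-x_max)
        return unnorm / denom                                 # each row sums to <= 1

    weights = softermax_sketch(jnp.array([[2.0, 1.0, -1.0]]))  # drop-in for jax.nn.softmax
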
@@ -120,6 +125,7 @@ def yat_attention(
      precision: PrecisionLike = None,
      module: Optional[Module] = None,
      epsilon: float = 1e-5,
+     use_softermax: bool = False,
  ):
    """Computes attention using YatNMN distance-based calculation."""
    query, key, value = promote_dtype((query, key, value), dtype=dtype)
@@ -147,6 +153,7 @@ def yat_attention(
      precision,
      module,
      epsilon,
+     use_softermax,
    )

    # return weighted sum over values for each query position
@@ -362,6 +369,7 @@ class MultiHeadAttention(Module):
      out_dot_general_cls: Any = None,
      rngs: rnglib.Rngs,
      epsilon: float = 1e-5,
+     use_softermax: bool = False,
  ):
    self.num_heads = num_heads
    self.in_features = in_features
@@ -390,6 +398,7 @@ class MultiHeadAttention(Module):
    self.qkv_dot_general_cls = qkv_dot_general_cls
    self.out_dot_general_cls = out_dot_general_cls
    self.epsilon = epsilon
+   self.use_softermax = use_softermax
    self.use_alpha = use_alpha
    self.alpha_init = alpha_init
    self.use_dropconnect = use_dropconnect
@@ -451,8 +460,6 @@ class MultiHeadAttention(Module):
    self.key_ln = None

    # Remove the output layer - no more self.out
-   self.rngs = rngs if dropout_rate > 0.0 else None
-
    self.cached_key: nnx.Cache[Array] | None = None
    self.cached_value: nnx.Cache[Array] | None = None
    self.cache_index: nnx.Cache[Array] | None = None
@@ -498,8 +505,6 @@ class MultiHeadAttention(Module):
    Returns:
      output of shape `[batch_sizes..., length, features]`.
    """
-   if rngs is None:
-     rngs = self.rngs

    if inputs_k is None:
      if inputs_v is not None:
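
Together with the constructor hunk above, these removals mean the module no longer caches an rngs fallback for dropout; the rngs argument of __call__, whose rngs-is-None fallback is deleted here, presumably must now be supplied explicitly whenever dropout is active. A hypothetical call, where mha stands for an existing MultiHeadAttention instance built with a non-zero dropout rate and x for a [batch, length, features] array:

    from flax import nnx

    # mha and x are placeholders; the point is only the explicit per-call rngs,
    # which is an assumption based on the removed fallback above.
    out = mha(x, rngs=nnx.Rngs(dropout=0))
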
@@ -621,6 +626,7 @@ class MultiHeadAttention(Module):
      precision=self.precision,
      module=self if sow_weights else None,
      epsilon=self.epsilon, # Pass epsilon to yat_attention
+     use_softermax=self.use_softermax,
    )
    # Reshape attention output back to original embedding dimension
    # from [batch..., length, num_heads, head_dim] to [batch..., length, qkv_features]
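
For completeness, a hypothetical end-to-end use of the flag added in this release range. Only num_heads, in_features, epsilon, use_softermax, and rngs are visible in this diff; the sketch assumes the remaining constructor and call arguments have workable defaults, which is not confirmed here.

    import jax.numpy as jnp
    from flax import nnx
    from nmn.nnx.yatattention import MultiHeadAttention

    # Sketch under the assumptions above; not a verified 0.1.8 API listing.
    layer = MultiHeadAttention(
        num_heads=4,
        in_features=64,
        use_softermax=True,   # normalize attention with softermax instead of softmax
        rngs=nnx.Rngs(0),
    )
    x = jnp.ones((2, 16, 64))  # [batch, length, in_features]
    y = layer(x)               # self-attention; inputs_k / inputs_v default to inputs_q
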