nmn-0.1.7-py3-none-any.whl → nmn-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nmn/nnx/TODO ADDED
@@ -0,0 +1,2 @@
+ - add support to masked kernels
+ - explain attention [directed graph]
nmn/nnx/yatattention.py CHANGED
@@ -44,6 +44,8 @@ def yat_attention_weights(
   module: Optional[Module] = None,
   epsilon: float = 1e-5,
   use_softermax: bool = False,
+  power: float = 1.0,
+
 ):
   """Computes attention weights using YatNMN distance-based calculation."""
   query, key = promote_dtype((query, key), dtype=dtype)
@@ -89,7 +91,7 @@ def yat_attention_weights(
 
   # normalize the attention weights
   if use_softermax:
-    attn_weights = softermax(attn_weights).astype(dtype)
+    attn_weights = softermax(attn_weights, n=power).astype(dtype)
   else:
     attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
 
@@ -126,6 +128,7 @@ def yat_attention(
   module: Optional[Module] = None,
   epsilon: float = 1e-5,
   use_softermax: bool = False,
+  power: float = 1.0,
 ):
   """Computes attention using YatNMN distance-based calculation."""
   query, key, value = promote_dtype((query, key, value), dtype=dtype)
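The functional core of these hunks is that `softermax` now receives an explicit exponent, `softermax(attn_weights, n=power)`, with `power` threaded through `yat_attention_weights` and `yat_attention`. The package's `softermax` implementation is not part of this diff; purely as a hypothetical illustration of how an exponent `n` can control the sharpness of a normalization over non-negative attention scores (an assumption about the scores, not nmn's actual code), a power-based normalization could look like this:

import jax.numpy as jnp

def power_normalize(scores, n=1.0, axis=-1, eps=1e-9):
    # Hypothetical sketch, not nmn's softermax: raise non-negative scores to the
    # n-th power and renormalize; larger n concentrates weight on the largest score.
    powered = jnp.power(scores, n)
    return powered / (jnp.sum(powered, axis=axis, keepdims=True) + eps)

# n=1.0 keeps the relative weights; n=2.0 sharpens the distribution.
weights = power_normalize(jnp.array([1.0, 2.0, 3.0]), n=2.0)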
@@ -154,6 +157,7 @@ def yat_attention(
     module,
     epsilon,
     use_softermax,
+    power,
   )
 
   # return weighted sum over values for each query position
@@ -370,6 +374,7 @@ class MultiHeadAttention(Module):
     rngs: rnglib.Rngs,
     epsilon: float = 1e-5,
     use_softermax: bool = False,
+    power: float = 1.0,
   ):
     self.num_heads = num_heads
     self.in_features = in_features
@@ -399,6 +404,7 @@ class MultiHeadAttention(Module):
     self.out_dot_general_cls = out_dot_general_cls
     self.epsilon = epsilon
     self.use_softermax = use_softermax
+    self.power = power
     self.use_alpha = use_alpha
     self.alpha_init = alpha_init
     self.use_dropconnect = use_dropconnect
@@ -460,8 +466,6 @@ class MultiHeadAttention(Module):
     self.key_ln = None
 
     # Remove the output layer - no more self.out
-    self.rngs = rngs if dropout_rate > 0.0 else None
-
     self.cached_key: nnx.Cache[Array] | None = None
     self.cached_value: nnx.Cache[Array] | None = None
     self.cache_index: nnx.Cache[Array] | None = None
@@ -507,8 +511,6 @@ class MultiHeadAttention(Module):
     Returns:
       output of shape `[batch_sizes..., length, features]`.
     """
-    if rngs is None:
-      rngs = self.rngs
 
     if inputs_k is None:
       if inputs_v is not None:
@@ -631,6 +633,7 @@ class MultiHeadAttention(Module):
       module=self if sow_weights else None,
       epsilon=self.epsilon, # Pass epsilon to yat_attention
       use_softermax=self.use_softermax,
+      power= self.power,
     )
     # Reshape attention output back to original embedding dimension
     # from [batch..., length, num_heads, head_dim] to [batch..., length, qkv_features]
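Taken together, 0.1.9 threads a `power` argument from `MultiHeadAttention.__init__` down to the `softermax` call and removes the stored `self.rngs` fallback, so dropout RNGs come from the `rngs` passed at call time. A hypothetical usage sketch follows; only `num_heads`, `in_features`, `rngs`, `epsilon`, `use_softermax`, and `power` are visible in this diff, and the remaining constructor defaults and the exact `__call__` signature are assumptions:

import jax.numpy as jnp
from flax import nnx
from nmn.nnx.yatattention import MultiHeadAttention

layer = MultiHeadAttention(
    num_heads=8,
    in_features=64,
    rngs=nnx.Rngs(0),
    use_softermax=True,  # route normalization through softermax
    power=2.0,           # new in 0.1.9: forwarded as softermax(attn_weights, n=power)
)

x = jnp.ones((2, 16, 64))               # [batch, length, in_features]
y = layer(x, rngs=nnx.Rngs(dropout=1))  # rngs supplied per call; the self.rngs fallback was removed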