nmn 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nmn-0.1.5 → nmn-0.1.7}/PKG-INFO +1 -1
- {nmn-0.1.5 → nmn-0.1.7}/pyproject.toml +1 -1
- nmn-0.1.7/src/nmn/nnx/loss/__init__.py +0 -0
- nmn-0.1.7/src/nmn/nnx/squashers/__init__.py +9 -0
- nmn-0.1.7/src/nmn/nnx/squashers/soft_tanh.py +29 -0
- nmn-0.1.7/src/nmn/nnx/squashers/softer_sigmoid.py +29 -0
- nmn-0.1.7/src/nmn/nnx/squashers/softermax.py +38 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/nnx/yatattention.py +47 -111
- {nmn-0.1.5 → nmn-0.1.7}/.github/workflows/publish.yml +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/.gitignore +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/LICENSE +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/MANIFEST.in +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/PUBLISH.md +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/README.md +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/hatch.toml +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/__init__.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/keras/nmn.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/linen/nmn.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/nnx/examples/language/mingpt.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/nnx/examples/vision/cnn_cifar.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/nnx/nmn.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/nnx/yatconv.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/tf/nmn.py +0 -0
- {nmn-0.1.5 → nmn-0.1.7}/src/nmn/torch/nmn.py +0 -0
nmn-0.1.7/src/nmn/nnx/squashers/soft_tanh.py
ADDED
@@ -0,0 +1,29 @@
+import jax.numpy as jnp
+from jax import Array
+
+def soft_tanh(
+    x: Array,
+    n: float = 1.0,
+) -> Array:
+    """
+    Maps a non-negative score to the range [-1, 1) using the soft-tanh function.
+
+    The soft-tanh function is defined as:
+    .. math::
+        \\text{soft-tanh}_n(x) = \\frac{x^n - 1}{1 + x^n}
+
+    The power `n` controls the transition sharpness: higher `n` makes the
+    function approach 1 more quickly for large `x`.
+
+    Args:
+        x (Array): A JAX array of non-negative scores (x >= 0).
+        n (float, optional): The power to raise the score to. Defaults to 1.0.
+
+    Returns:
+        Array: The mapped scores in the range [-1, 1).
+    """
+    if n <= 0:
+        raise ValueError("Power 'n' must be positive.")
+
+    x_n = jnp.power(x, n)
+    return (x_n - 1.0) / (1.0 + x_n)
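A minimal usage sketch of the new squasher (the import path follows the file added above; the example scores are made up):

import jax.numpy as jnp

from nmn.nnx.squashers.soft_tanh import soft_tanh

# Non-negative scores, e.g. raw yat-attention weights before normalization.
scores = jnp.array([0.0, 0.5, 1.0, 4.0])

# n = 1 gives (x - 1) / (1 + x): 0 maps to -1, 1 maps to 0, large x approaches 1.
print(soft_tanh(scores))         # ≈ [-1.     -0.3333  0.      0.6   ]
# A larger n sharpens the transition around x = 1.
print(soft_tanh(scores, n=4.0))  # ≈ [-1.     -0.8824  0.      0.9922]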

nmn-0.1.7/src/nmn/nnx/squashers/softer_sigmoid.py
ADDED
@@ -0,0 +1,29 @@
+import jax.numpy as jnp
+from jax import Array
+
+def softer_sigmoid(
+    x: Array,
+    n: float = 1.0,
+) -> Array:
+    """
+    Squashes a non-negative score into the range [0, 1) using the soft-sigmoid function.
+
+    The soft-sigmoid function is defined as:
+    .. math::
+        \\text{soft-sigmoid}_n(x) = \\frac{x^n}{1 + x^n}
+
+    The power `n` modulates the softness: higher `n` makes the function approach
+    1 faster for large `x`, while `n < 1` makes the transition slower.
+
+    Args:
+        x (Array): A JAX array of non-negative scores (x >= 0).
+        n (float, optional): The power to raise the score to. Defaults to 1.0.
+
+    Returns:
+        Array: The squashed scores in the range [0, 1).
+    """
+    if n <= 0:
+        raise ValueError("Power 'n' must be positive.")
+
+    x_n = jnp.power(x, n)
+    return x_n / (1.0 + x_n)
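For comparison, a small sketch of softer_sigmoid on the same made-up scores (note that softer_sigmoid(x, n) equals (soft_tanh(x, n) + 1) / 2):

import jax.numpy as jnp

from nmn.nnx.squashers.softer_sigmoid import softer_sigmoid

scores = jnp.array([0.0, 0.5, 1.0, 4.0])

# n = 1 gives x / (1 + x): 0 maps to 0, 1 maps to 0.5, large x approaches 1.
print(softer_sigmoid(scores))         # ≈ [0.      0.3333  0.5     0.8   ]
# A larger n sharpens the transition around x = 1.
print(softer_sigmoid(scores, n=2.0))  # ≈ [0.      0.2     0.5     0.9412]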

nmn-0.1.7/src/nmn/nnx/squashers/softermax.py
ADDED
@@ -0,0 +1,38 @@
+import jax.numpy as jnp
+from jax import Array
+from typing import Optional
+
+def softermax(
+    x: Array,
+    n: float = 1.0,
+    epsilon: float = 1e-12,
+    axis: Optional[int] = -1,
+) -> Array:
+    """
+    Normalizes a set of non-negative scores using the Softermax function.
+
+    The Softermax function is defined as:
+    .. math::
+        \\text{softermax}_n(x_k, \\{x_i\\}) = \\frac{x_k^n}{\\epsilon + \\sum_i x_i^n}
+
+    The power `n` controls the sharpness of the distribution: `n=1` recovers
+    the original Softermax, while `n > 1` makes the distribution harder (more
+    peaked), and `0 < n < 1` makes it softer.
+
+    Args:
+        x (Array): A JAX array of non-negative scores.
+        n (float, optional): The power to raise each score to. Defaults to 1.0.
+        epsilon (float, optional): A small constant for numerical stability.
+            Defaults to 1e-12.
+        axis (Optional[int], optional): The axis to perform the sum over.
+            Defaults to -1.
+
+    Returns:
+        Array: The normalized scores.
+    """
+    if n <= 0:
+        raise ValueError("Power 'n' must be positive.")
+
+    x_n = jnp.power(x, n)
+    sum_x_n = jnp.sum(x_n, axis=axis, keepdims=True)
+    return x_n / (epsilon + sum_x_n)
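A short sketch contrasting softermax with the exponential softmax (softermax is importable from nmn.nnx.squashers, as the yatattention.py import below shows; the scores are made up):

import jax
import jax.numpy as jnp

from nmn.nnx.squashers import softermax

# Non-negative scores, as produced by the yat attention computation.
scores = jnp.array([1.0, 2.0, 4.0])

# softermax normalizes by the sum of powered scores, with no exponentiation.
print(softermax(scores))         # ≈ [0.1429 0.2857 0.5714]  (1/7, 2/7, 4/7)
print(softermax(scores, n=2.0))  # ≈ [0.0476 0.1905 0.7619]  (more peaked)
# The usual softmax exponentiates first, so large scores dominate even more.
print(jax.nn.softmax(scores))    # ≈ [0.0420 0.1142 0.8438]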

{nmn-0.1.5 → nmn-0.1.7}/src/nmn/nnx/yatattention.py
RENAMED
@@ -26,8 +26,10 @@ from flax.typing import (
   DotGeneralT,
 )
 
+from nmn.nnx.nmn import YatNMN
+from jax import Array
 
-
+from nmn.nnx.squashers import softermax
 def yat_attention_weights(
   query: Array,
   key: Array,
@@ -41,6 +43,7 @@ def yat_attention_weights(
   precision: PrecisionLike = None,
   module: Optional[Module] = None,
   epsilon: float = 1e-5,
+  use_softermax: bool = False,
 ):
   """Computes attention weights using YatNMN distance-based calculation."""
   query, key = promote_dtype((query, key), dtype=dtype)
@@ -85,7 +88,10 @@ def yat_attention_weights(
     attn_weights = jnp.where(mask, attn_weights, big_neg)
 
   # normalize the attention weights
-  attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
+  if use_softermax:
+    attn_weights = softermax(attn_weights).astype(dtype)
+  else:
+    attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
 
   if module:
     module.sow(nnx.Intermediate, 'attention_weights', attn_weights)
@@ -119,6 +125,7 @@ def yat_attention(
   precision: PrecisionLike = None,
   module: Optional[Module] = None,
   epsilon: float = 1e-5,
+  use_softermax: bool = False,
 ):
   """Computes attention using YatNMN distance-based calculation."""
   query, key, value = promote_dtype((query, key, value), dtype=dtype)
@@ -146,6 +153,7 @@ def yat_attention(
     precision,
     module,
     epsilon,
+    use_softermax,
   )
 
   # return weighted sum over values for each query position
@@ -153,91 +161,6 @@ def yat_attention(
     '...hqk,...khd->...qhd', attn_weights, value, precision=precision
   )
 
-Array = jax.Array
-
-# Add YatNMN class implementation
-default_bias_init = initializers.zeros_init()
-default_alpha_init = initializers.ones_init()
-
-class YatNMN(Module):
-  """A linear transformation with custom distance-based computation."""
-
-  def __init__(
-    self,
-    in_features: int,
-    out_features: int,
-    *,
-    use_bias: bool = True,
-    use_alpha: bool = True,
-    dtype: Optional[Dtype] = None,
-    param_dtype: Dtype = jnp.float32,
-    precision: PrecisionLike = None,
-    kernel_init: Initializer = default_kernel_init,
-    bias_init: Initializer = default_bias_init,
-    alpha_init: Initializer = default_alpha_init,
-    dot_general: DotGeneralT = lax.dot_general,
-    rngs: rnglib.Rngs,
-    epsilon: float = 1e-5,
-  ):
-
-    kernel_key = rngs.params()
-    self.kernel = nnx.Param(
-      kernel_init(kernel_key, (in_features, out_features), param_dtype)
-    )
-    self.bias: nnx.Param[jax.Array] | None
-    if use_bias:
-      bias_key = rngs.params()
-      self.bias = nnx.Param(bias_init(bias_key, (out_features,), param_dtype))
-    else:
-      self.bias = None
-
-    self.alpha: nnx.Param[jax.Array] | None
-    if use_alpha:
-      alpha_key = rngs.params()
-      self.alpha = nnx.Param(alpha_init(alpha_key, (1,), param_dtype))
-    else:
-      self.alpha = None
-
-    self.in_features = in_features
-    self.out_features = out_features
-    self.use_bias = use_bias
-    self.use_alpha = use_alpha
-    self.dtype = dtype
-    self.param_dtype = param_dtype
-    self.precision = precision
-    self.kernel_init = kernel_init
-    self.bias_init = bias_init
-    self.dot_general = dot_general
-    self.epsilon = epsilon
-
-  def __call__(self, inputs: Array) -> Array:
-    """Applies YatNMN transformation to inputs."""
-    kernel = self.kernel.value
-    bias = self.bias.value if self.bias is not None else None
-    alpha = self.alpha.value if self.alpha is not None else None
-
-    y = self.dot_general(
-      inputs,
-      kernel,
-      (((inputs.ndim - 1,), (0,)), ((), ())),
-      precision=self.precision,
-    )
-
-    inputs_squared_sum = jnp.sum(inputs**2, axis=-1, keepdims=True)
-    kernel_squared_sum = jnp.sum(kernel**2, axis=0, keepdims=True)
-    distances = inputs_squared_sum + kernel_squared_sum - 2 * y
-
-    # Element-wise operation
-    y = y ** 2 / (distances + self.epsilon)
-
-    if bias is not None:
-      y += jnp.reshape(bias, (1,) * (y.ndim - 1) + (-1,))
-
-    if alpha is not None:
-      scale = (jnp.sqrt(self.out_features) / jnp.log(1 + self.out_features)) ** alpha
-      y = y * scale
-
-    return y
 
 
 def dot_product_attention_weights(
@@ -435,6 +358,10 @@ class MultiHeadAttention(Module):
     attention_fn: Callable[..., Array] = yat_attention,
     decode: bool | None = None,
     normalize_qk: bool = False,
+    use_alpha: bool = True,
+    alpha_init: Initializer = initializers.ones_init(),
+    use_dropconnect: bool = False,
+    dropconnect_rate: float = 0.0,
     # Deprecated, will be removed.
     qkv_dot_general: DotGeneralT | None = None,
     out_dot_general: DotGeneralT | None = None,
@@ -442,6 +369,7 @@ class MultiHeadAttention(Module):
     out_dot_general_cls: Any = None,
     rngs: rnglib.Rngs,
     epsilon: float = 1e-5,
+    use_softermax: bool = False,
   ):
     self.num_heads = num_heads
     self.in_features = in_features
@@ -470,6 +398,11 @@ class MultiHeadAttention(Module):
    self.qkv_dot_general_cls = qkv_dot_general_cls
    self.out_dot_general_cls = out_dot_general_cls
    self.epsilon = epsilon
+    self.use_softermax = use_softermax
+    self.use_alpha = use_alpha
+    self.alpha_init = alpha_init
+    self.use_dropconnect = use_dropconnect
+    self.dropconnect_rate = dropconnect_rate
 
    if self.qkv_features % self.num_heads != 0:
      raise ValueError(
@@ -491,6 +424,10 @@ class MultiHeadAttention(Module):
      use_bias=self.use_bias,
      precision=self.precision,
      epsilon=self.epsilon,
+      use_alpha=self.use_alpha,
+      alpha_init=self.alpha_init,
+      use_dropconnect=self.use_dropconnect,
+      drop_rate=self.dropconnect_rate,
    )
 
    # project inputs_q to multi-headed q/k/v
@@ -590,10 +527,23 @@ class MultiHeadAttention(Module):
        f'but module expects {self.in_features}.'
      )
 
+    is_deterministic: bool = False
+    if self.dropout_rate > 0.0 or (
+      self.use_dropconnect and self.dropconnect_rate > 0.0
+    ):
+      is_deterministic = first_from(
+        deterministic,
+        self.deterministic,
+        error_msg="""No `deterministic` argument was provided to MultiHeadAttention
+          as either a __call__ argument, class attribute, or nnx.flag.""",
+      )
+    else:
+      is_deterministic = True
+
    # Apply YatNMN transformations and reshape to multi-head format
-    query = self.query(inputs_q)
-    key = self.key(inputs_k)
-    value = self.value(inputs_v)
+    query = self.query(inputs_q, deterministic=is_deterministic)
+    key = self.key(inputs_k, deterministic=is_deterministic)
+    value = self.value(inputs_v, deterministic=is_deterministic)
 
    # Reshape from [batch..., length, qkv_features] to [batch..., length, num_heads, head_dim]
    query = query.reshape(query.shape[:-1] + (self.num_heads, self.head_dim))
@@ -660,26 +610,11 @@ class MultiHeadAttention(Module):
      ),
    )
 
-    if (
-      self.dropout_rate > 0.0
-    ):
-      deterministic = first_from(
-        deterministic,
-        self.deterministic,
-        error_msg="""No `deterministic` argument was provided to MultiHeadAttention
-          as either a __call__ argument, class attribute, or nnx.flag.""",
-      )
-      if not deterministic:
-        if rngs is None:
-          raise ValueError(
-            "'rngs' must be provided if 'dropout_rng' is not given."
-          )
-        dropout_rng = rngs.dropout()
-      else:
-        dropout_rng = None
-    else:
-      deterministic = True
-      dropout_rng = None
+    dropout_rng = None
+    if self.dropout_rate > 0.0 and not is_deterministic:
+      if rngs is None:
+        raise ValueError("'rngs' must be provided for dropout.")
+      dropout_rng = rngs.dropout()
 
    # apply attention with epsilon parameter for YatNMN
    x = self.attention_fn(
@@ -690,11 +625,12 @@ class MultiHeadAttention(Module):
      dropout_rng=dropout_rng,
      dropout_rate=self.dropout_rate,
      broadcast_dropout=self.broadcast_dropout,
-      deterministic=deterministic,
+      deterministic=is_deterministic,
      dtype=self.dtype,
      precision=self.precision,
      module=self if sow_weights else None,
      epsilon=self.epsilon,  # Pass epsilon to yat_attention
+      use_softermax=self.use_softermax,
    )
    # Reshape attention output back to original embedding dimension
    # from [batch..., length, num_heads, head_dim] to [batch..., length, qkv_features]
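To make the new use_softermax switch concrete, here is an illustrative sketch of the two normalization branches added to yat_attention_weights; the score matrix is made up, and only softermax and jax.nn.softmax come from the code above:

import jax
import jax.numpy as jnp

from nmn.nnx.squashers import softermax

# Hypothetical non-negative attention scores for one head: [query_len, key_len].
attn_weights = jnp.array([[1.0, 2.0, 4.0],
                          [0.5, 0.5, 9.0]])

# use_softermax=True: power-based normalization over the last axis.
print(softermax(attn_weights))
# use_softermax=False (default): the usual exponential softmax.
print(jax.nn.softmax(attn_weights))

Because the yat attention scores are already non-negative, softermax can normalize them directly by their (powered) sum instead of exponentiating them first.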