PyPI - adasplash - Versions diffs - 0.2.1__tar.gz → 0.2.2__tar.gz - Mend

adasplash 0.2.1.tar.gz → 0.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

{adasplash-0.2.1 → adasplash-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: adasplash
-Version: 0.2.1
+Version: 0.2.2
 Summary: AdaSplash: Efficient Adaptive Sparse Attention in Triton
 Home-page: https://github.com/deep-spin/adasplash
 Author: Nuno Gonçalves, Marcos Treviso
@@ -93,7 +93,6 @@ from adasplash import (
     triton_entmax_v2,
     triton_sparsemax,
     triton_entmax15,
-    entmax_attention,
 )
 ```
@@ -108,7 +107,6 @@ from adasplash import (
 | `triton_entmax_v1` | Original entmax implementation. |
 | `triton_sparsemax` | Convenience v2 sparsemax call, equivalent to entmax with `alpha=2.0`. |
 | `triton_entmax15` | Convenience v2 entmax-1.5 call. |
-| `entmax_attention` | Dense attention utility using v2 `triton_entmax`. |
 ## Sparse Attention Examples
@@ -188,23 +186,29 @@ y_entmax15 = triton_entmax15(x)
 For generic alpha values other than `1.5` and `2.0`, v2 disables histogram initialization internally and uses more refinement iterations for correctness.
-## Dense Entmax Attention Utility
+## Attention Examples
+The `examples/attention.py` file contains two small helpers that show the difference between the fused AdaSplash kernel and a dense reference-style implementation.
+### Flash Entmax Attention
 ```python
-from adasplash import entmax_attention
-out = entmax_attention(
-    q,
-    k,
-    v,
-    alpha=1.5,
-    is_causal=True,
-    varlen=None,
-    padding="right",
-)
+from examples.attention import flash_entmax_attention
+out = flash_entmax_attention(q, k, v, is_causal=True)
+```
+`flash_entmax_attention` is a thin example wrapper around `adasplash`, the actual fused flash entmax attention path.
+### Slow Dense Entmax Attention
+```python
+from examples.attention import slow_entmax_attention
+out = slow_entmax_attention(q, k, v, is_causal=True, padding="right")
 ```
-`entmax_attention` is a dense utility built on top of v2 `triton_entmax`. It supports causal masking, non-causal masking, variable lengths, left/right padding, ALiBi slopes, and gradients through `q`, `k`, and `v`.
+`slow_entmax_attention` materializes dense attention scores and applies `triton_entmax`. It is useful for examples and small correctness checks, but it is not the AdaSplash flash kernel and should not be used for long contexts.
 ## Backwards Compatibility

{adasplash-0.2.1 → adasplash-0.2.2}/README.md RENAMED Viewed

@@ -58,7 +58,6 @@ from adasplash import (
     triton_entmax_v2,
     triton_sparsemax,
     triton_entmax15,
-    entmax_attention,
 )
 ```
@@ -73,7 +72,6 @@ from adasplash import (
 | `triton_entmax_v1` | Original entmax implementation. |
 | `triton_sparsemax` | Convenience v2 sparsemax call, equivalent to entmax with `alpha=2.0`. |
 | `triton_entmax15` | Convenience v2 entmax-1.5 call. |
-| `entmax_attention` | Dense attention utility using v2 `triton_entmax`. |
 ## Sparse Attention Examples
@@ -153,23 +151,29 @@ y_entmax15 = triton_entmax15(x)
 For generic alpha values other than `1.5` and `2.0`, v2 disables histogram initialization internally and uses more refinement iterations for correctness.
-## Dense Entmax Attention Utility
+## Attention Examples
+The `examples/attention.py` file contains two small helpers that show the difference between the fused AdaSplash kernel and a dense reference-style implementation.
+### Flash Entmax Attention
 ```python
-from adasplash import entmax_attention
-out = entmax_attention(
-    q,
-    k,
-    v,
-    alpha=1.5,
-    is_causal=True,
-    varlen=None,
-    padding="right",
-)
+from examples.attention import flash_entmax_attention
+out = flash_entmax_attention(q, k, v, is_causal=True)
+```
+`flash_entmax_attention` is a thin example wrapper around `adasplash`, the actual fused flash entmax attention path.
+### Slow Dense Entmax Attention
+```python
+from examples.attention import slow_entmax_attention
+out = slow_entmax_attention(q, k, v, is_causal=True, padding="right")
 ```
-`entmax_attention` is a dense utility built on top of v2 `triton_entmax`. It supports causal masking, non-causal masking, variable lengths, left/right padding, ALiBi slopes, and gradients through `q`, `k`, and `v`.
+`slow_entmax_attention` materializes dense attention scores and applies `triton_entmax`. It is useful for examples and small correctness checks, but it is not the AdaSplash flash kernel and should not be used for long contexts.
 ## Backwards Compatibility

{adasplash-0.2.1 → adasplash-0.2.2}/adasplash/__init__.py RENAMED Viewed

@@ -68,22 +68,6 @@ def triton_entmax15(x, **kwargs):
     return triton_entmax15(x, **kwargs)
-def entmax_attention(q, k, v, alpha=1.5, varlen=None, is_causal=False, padding="right", niter=2, alibi_slopes=None):
-    from .attention import entmax_attention as _entmax_attention
-    return _entmax_attention(
-        q,
-        k,
-        v,
-        alpha=alpha,
-        varlen=varlen,
-        is_causal=is_causal,
-        padding=padding,
-        niter=niter,
-        alibi_slopes=alibi_slopes,
-    )
 adasplash2 = _adasplash_v2
 __all__ = [
@@ -98,5 +82,4 @@ __all__ = [
     "triton_entmax_v2",
     "triton_sparsemax",
     "triton_entmax15",
-    "entmax_attention",
 ]

{adasplash-0.2.1 → adasplash-0.2.2}/adasplash.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: adasplash
-Version: 0.2.1
+Version: 0.2.2
 Summary: AdaSplash: Efficient Adaptive Sparse Attention in Triton
 Home-page: https://github.com/deep-spin/adasplash
 Author: Nuno Gonçalves, Marcos Treviso
@@ -93,7 +93,6 @@ from adasplash import (
     triton_entmax_v2,
     triton_sparsemax,
     triton_entmax15,
-    entmax_attention,
 )
 ```
@@ -108,7 +107,6 @@ from adasplash import (
 | `triton_entmax_v1` | Original entmax implementation. |
 | `triton_sparsemax` | Convenience v2 sparsemax call, equivalent to entmax with `alpha=2.0`. |
 | `triton_entmax15` | Convenience v2 entmax-1.5 call. |
-| `entmax_attention` | Dense attention utility using v2 `triton_entmax`. |
 ## Sparse Attention Examples
@@ -188,23 +186,29 @@ y_entmax15 = triton_entmax15(x)
 For generic alpha values other than `1.5` and `2.0`, v2 disables histogram initialization internally and uses more refinement iterations for correctness.
-## Dense Entmax Attention Utility
+## Attention Examples
+The `examples/attention.py` file contains two small helpers that show the difference between the fused AdaSplash kernel and a dense reference-style implementation.
+### Flash Entmax Attention
 ```python
-from adasplash import entmax_attention
-out = entmax_attention(
-    q,
-    k,
-    v,
-    alpha=1.5,
-    is_causal=True,
-    varlen=None,
-    padding="right",
-)
+from examples.attention import flash_entmax_attention
+out = flash_entmax_attention(q, k, v, is_causal=True)
+```
+`flash_entmax_attention` is a thin example wrapper around `adasplash`, the actual fused flash entmax attention path.
+### Slow Dense Entmax Attention
+```python
+from examples.attention import slow_entmax_attention
+out = slow_entmax_attention(q, k, v, is_causal=True, padding="right")
 ```
-`entmax_attention` is a dense utility built on top of v2 `triton_entmax`. It supports causal masking, non-causal masking, variable lengths, left/right padding, ALiBi slopes, and gradients through `q`, `k`, and `v`.
+`slow_entmax_attention` materializes dense attention scores and applies `triton_entmax`. It is useful for examples and small correctness checks, but it is not the AdaSplash flash kernel and should not be used for long contexts.
 ## Backwards Compatibility

{adasplash-0.2.1 → adasplash-0.2.2}/adasplash.egg-info/SOURCES.txt RENAMED Viewed

@@ -6,7 +6,6 @@ adasplash/__init__.py
 adasplash/adasplash_block_mask.py
 adasplash/adasplash_no_block_mask.py
 adasplash/adasplash_v2.py
-adasplash/attention.py
 adasplash/triton_entmax.py
 adasplash/triton_entmax_v2.py
 adasplash.egg-info/PKG-INFO

{adasplash-0.2.1 → adasplash-0.2.2}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="adasplash",
-    version="0.2.1",
+    version="0.2.2",
     author="Nuno Gonçalves, Marcos Treviso",
     author_email="marcosvtreviso@gmail.com",
     description="AdaSplash: Efficient Adaptive Sparse Attention in Triton",

{adasplash-0.2.1 → adasplash-0.2.2}/tests/test_attention.py RENAMED Viewed

@@ -4,7 +4,7 @@ import pytest
 import torch
 from entmax import entmax_bisect
-import adasplash
+from examples.attention import flash_entmax_attention, slow_entmax_attention
 pytestmark = pytest.mark.gpu
@@ -59,7 +59,7 @@ def _run_attention_case(padding, is_causal):
     ref = reference_attention(q, k, v, varlen=varlen, is_causal=is_causal, padding=padding, alibi_slopes=alibi)
     ref_dq, ref_dk, ref_dv = torch.autograd.grad(ref, (q, k, v), do)
-    out = adasplash.entmax_attention(
+    out = slow_entmax_attention(
         q,
         k,
         v,
@@ -77,12 +77,31 @@ def _run_attention_case(padding, is_causal):
     assert torch.allclose(tri_dv, ref_dv, atol=1e-3, rtol=1e-3)
-def test_entmax_attention_fast_forward_backward_smoke():
+def test_slow_entmax_attention_fast_forward_backward_smoke():
     _run_attention_case(padding="right", is_causal=True)
+def test_flash_entmax_attention_example_smoke():
+    torch.manual_seed(42)
+    q = torch.randn(1, 1, 128, 32, device="cuda", dtype=torch.float32, requires_grad=True).contiguous()
+    k = torch.randn_like(q, requires_grad=True).contiguous()
+    v = torch.randn_like(q, requires_grad=True).contiguous()
+    do = torch.randn_like(q)
+    ref = reference_attention(q, k, v, is_causal=True)
+    ref_dq, ref_dk, ref_dv = torch.autograd.grad(ref, (q, k, v), do)
+    out = flash_entmax_attention(q, k, v, is_causal=True, niter=10)
+    tri_dq, tri_dk, tri_dv = torch.autograd.grad(out, (q, k, v), do)
+    assert torch.allclose(out, ref, atol=1e-4, rtol=1e-4)
+    assert torch.allclose(tri_dq, ref_dq, atol=1e-4, rtol=1e-4)
+    assert torch.allclose(tri_dk, ref_dk, atol=1e-4, rtol=1e-4)
+    assert torch.allclose(tri_dv, ref_dv, atol=1e-4, rtol=1e-4)
 @pytest.mark.slow
 @pytest.mark.parametrize("padding", ["left", "right"])
 @pytest.mark.parametrize("is_causal", [False, True])
-def test_entmax_attention_forward_backward_matches_reference(padding, is_causal):
+def test_slow_entmax_attention_forward_backward_matches_reference(padding, is_causal):
     _run_attention_case(padding=padding, is_causal=is_causal)

{adasplash-0.2.1 → adasplash-0.2.2}/tests/test_public_api.py RENAMED Viewed

@@ -16,7 +16,6 @@ def test_public_api_exports_are_lazy_and_versioned():
         "triton_entmax_v2",
         "triton_sparsemax",
         "triton_entmax15",
-        "entmax_attention",
     ]:
         assert name in adasplash.__all__
         assert callable(getattr(adasplash, name))
@@ -49,24 +48,13 @@ def test_dispatcher_signatures_are_stable():
         "use_histogram",
         "fast_math",
     ]
-    assert list(inspect.signature(adasplash.entmax_attention).parameters) == [
-        "q",
-        "k",
-        "v",
-        "alpha",
-        "varlen",
-        "is_causal",
-        "padding",
-        "niter",
-        "alibi_slopes",
-    ]
+    assert not hasattr(adasplash, "entmax_attention")
 def test_package_source_allowlist():
     package_dir = Path(adasplash.__file__).resolve().parent
     allowed = {
         "__init__.py",
-        "attention.py",
         "adasplash_block_mask.py",
         "adasplash_no_block_mask.py",
         "adasplash_v2.py",

adasplash-0.2.1/adasplash/attention.py DELETED Viewed

@@ -1,73 +0,0 @@
-import math
-import torch
-def _varlen_mask(varlen, size, padding):
-    positions = torch.arange(size, device=varlen.device)
-    if padding == "right":
-        return positions[None, :] < varlen[:, None]
-    if padding == "left":
-        return positions[None, :] >= size - varlen[:, None]
-    raise ValueError("padding must be either 'right' or 'left'.")
-def _alibi_bias(q, k, alibi_slopes):
-    _, n_heads, q_len, _ = q.shape
-    k_len = k.shape[-2]
-    if alibi_slopes.shape != (n_heads,):
-        raise ValueError(f"alibi_slopes must have shape ({n_heads},); got {tuple(alibi_slopes.shape)}.")
-    if q_len == 1 and k_len > 1:
-        rel_pos = torch.arange(k_len, device=q.device) - (k_len - 1)
-        rel_pos = rel_pos.view(1, 1, 1, k_len)
-    else:
-        q_pos = torch.arange(q_len, device=q.device)
-        k_pos = torch.arange(k_len, device=q.device)
-        rel_pos = k_pos[None, :] - q_pos[:, None]
-        rel_pos = rel_pos.view(1, 1, q_len, k_len)
-    return alibi_slopes.to(q.device).view(1, n_heads, 1, 1) * rel_pos
-def entmax_attention(q, k, v, alpha=1.5, varlen=None, is_causal=False, padding="right", niter=2, alibi_slopes=None):
-    """Dense QK attention using the public v2 Triton entmax activation."""
-    if q.dim() != 4 or k.dim() != 4 or v.dim() != 4:
-        raise ValueError("q, k and v must have shape (batch, heads, seq_len, head_dim).")
-    if k.shape != v.shape:
-        raise ValueError(f"k and v must have the same shape; got {tuple(k.shape)} and {tuple(v.shape)}.")
-    if q.shape[0] != k.shape[0] or q.shape[1] != k.shape[1] or q.shape[3] != k.shape[3]:
-        raise ValueError("q, k and v must agree on batch, heads and head_dim.")
-    _, _, q_len, head_dim = q.shape
-    k_len = k.shape[-2]
-    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_dim)
-    if alibi_slopes is not None:
-        scores = scores + _alibi_bias(q, k, alibi_slopes)
-    if is_causal:
-        if q_len == k_len:
-            causal = torch.tril(torch.ones(q_len, k_len, device=q.device, dtype=torch.bool))
-        else:
-            q_pos = torch.arange(q_len, device=q.device) + (k_len - q_len)
-            k_pos = torch.arange(k_len, device=q.device)
-            causal = q_pos[:, None] >= k_pos[None, :]
-        scores = scores.masked_fill(~causal.view(1, 1, q_len, k_len), float("-inf"))
-    output_mask = None
-    if varlen is not None:
-        if varlen.dim() != 1 or varlen.shape[0] != q.shape[0]:
-            raise ValueError(f"varlen must be a 1-D tensor of shape ({q.shape[0]},).")
-        key_mask = _varlen_mask(varlen.to(q.device), k_len, padding)
-        scores = scores.masked_fill(~key_mask[:, None, None, :], float("-inf"))
-        if q_len == k_len:
-            output_mask = _varlen_mask(varlen.to(q.device), q_len, padding)[:, None, :, None]
-            scores = scores.masked_fill(~output_mask, 0.0)
-    from .triton_entmax_v2 import triton_entmax
-    probs = triton_entmax(scores.contiguous(), alpha=alpha, n_iter=niter, fast_math=False)
-    out = torch.matmul(probs, v)
-    if output_mask is not None:
-        out = out.masked_fill(~output_mask, 0)
-    return out