PyPI - blksprs - Versions diffs - 1.1__tar.gz → 1.2.1__tar.gz - Mend

blksprs 1.1.tar.gz → 1.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{blksprs-1.1 → blksprs-1.2.1}/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.1
+Version: 1.2.1
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
 Project-URL: Bugtracker, https://github.com/FelixSchoen/blksprs/issues
-Requires-Python: >=3.12
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 Requires-Dist: torch
 Provides-Extra: test
@@ -21,6 +21,9 @@ Requires-Dist: pdoc3; extra == "deploy"
 # blksprs
+[![GitHub Release](https://img.shields.io/github/v/release/FelixSchoen/blksprs?include_prereleases&label=Latest%20Release)](https://github.com/FelixSchoen/blksprs/releases)
+[![Python Version](https://img.shields.io/badge/Python%20Version-3.11-blue)](https://www.python.org/downloads/release/python-3119/)
 ## Overview
 A lightweight and efficient library for operations on block-sparse matrices in PyTorch using Triton.
@@ -33,7 +36,8 @@ Currently supported operations (includes gradient calculation):
 - Transposition
 - Gather
 - Scatter (_supports either no reduction or summation, gradients are only available for summation_)
-- Conversion from and to sparse form
+- Conversion to and from sparse form
+- Conversion to different sparsity layouts and different sparsity block sizes
 As with this library sparse matrices are represented using a tuple of `(matrix, sparsity_layout, sparsity_block_size)`,
 any element-wise operations can be applied in regular torch-like fashion.
@@ -59,6 +63,11 @@ We recommend installing blksprs from [PyPI](https://pypi.org/project/blksprs/) u
 ```pip install blksprs```
+### Dependencies
+- [PyTorch](https://pytorch.org/) (built with v2.4.0)
+- _[Triton](https://github.com/triton-lang/triton) (included with PyTorch)_
 ## Changelog
 See [`CHANGELOG.md`](https://github.com/FelixSchoen/blksprs/blob/main/CHANGELOG.md) for a detailed changelog.

{blksprs-1.1 → blksprs-1.2.1}/README.md RENAMED Viewed

@@ -1,5 +1,8 @@
 # blksprs
+[![GitHub Release](https://img.shields.io/github/v/release/FelixSchoen/blksprs?include_prereleases&label=Latest%20Release)](https://github.com/FelixSchoen/blksprs/releases)
+[![Python Version](https://img.shields.io/badge/Python%20Version-3.11-blue)](https://www.python.org/downloads/release/python-3119/)
 ## Overview
 A lightweight and efficient library for operations on block-sparse matrices in PyTorch using Triton.
@@ -12,7 +15,8 @@ Currently supported operations (includes gradient calculation):
 - Transposition
 - Gather
 - Scatter (_supports either no reduction or summation, gradients are only available for summation_)
-- Conversion from and to sparse form
+- Conversion to and from sparse form
+- Conversion to different sparsity layouts and different sparsity block sizes
 As with this library sparse matrices are represented using a tuple of `(matrix, sparsity_layout, sparsity_block_size)`,
 any element-wise operations can be applied in regular torch-like fashion.
@@ -38,6 +42,11 @@ We recommend installing blksprs from [PyPI](https://pypi.org/project/blksprs/) u
 ```pip install blksprs```
+### Dependencies
+- [PyTorch](https://pytorch.org/) (built with v2.4.0)
+- _[Triton](https://github.com/triton-lang/triton) (included with PyTorch)_
 ## Changelog
 See [`CHANGELOG.md`](https://github.com/FelixSchoen/blksprs/blob/main/CHANGELOG.md) for a detailed changelog.

blksprs-1.2.1/blksprs/layouting/sparsity_layout.py ADDED Viewed

@@ -0,0 +1,190 @@
+import math
+import torch
+import triton
+from torch import Tensor
+from triton import language as tl
+from blksprs.utils.tools import get_triton_block_size
+from blksprs.utils.validation import validate_triton_block_size, validate_dimensions, validate_device, \
+    validate_contiguous, validate_sparsity, validate_sparsity_block_size
+def build_sparsity_layout(x: Tensor, sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
+    """Builds the sparsity layout of a dense tensor in regular form covering its sparse blocks.
+    Args:
+        x (Tensor): A block-sparse (or dense) tensor in regular form.
+        sparsity_block_size (int): The size of the sparsity blocks.
+        triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
+    Returns:
+        Tensor: The sparsity layout of the input block-sparse (or dense) tensor.
+    """
+    validate_dimensions(x)
+    validate_contiguous(x)
+    validate_device(x)
+    output = torch.zeros(x.size(0), x.size(1) // sparsity_block_size, x.size(2) // sparsity_block_size,
+                         device=x.device, dtype=torch.int32)
+    x_b, x_r, x_c = x.size()
+    x_b_s, x_r_s, x_c_s = x.stride()
+    o_b, o_r, o_c = output.size()
+    o_b_s, o_r_s, o_c_s = output.stride()
+    if triton_block_size is None:
+        triton_block_size = get_triton_block_size(sparsity_block_size)
+    validate_triton_block_size(triton_block_size, sparsity_block_size)
+    triton_grid = lambda meta: [x_b,
+                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+    (kernel_sparsity_layout[triton_grid]
+     (x,
+      x_b, x_b_s, x_r_s, x_c_s,
+      output,
+      o_b, o_b_s, o_r_s, o_c_s,
+      sparsity_block_size,
+      triton_block_size))
+    return output
+@triton.jit
+def kernel_sparsity_layout(x,
+                           x_b, x_b_s, x_r_s, x_c_s,
+                           o,
+                           o_b, o_b_s, o_r_s, o_c_s,
+                           sparsity_block_size,
+                           TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+    # Get triton block indices
+    pid_bat = tl.program_id(axis=0)
+    pid_row = tl.program_id(axis=1)
+    pid_col = tl.program_id(axis=2)
+    # Load x values
+    blk_x_idx = (pid_bat * x_b_s +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx < x_b * x_b_s)
+    blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+    # Store sparsity layout value
+    if tl.min(blk_x) != 0 or tl.max(blk_x) != 0:
+        blk_o_idx = (pid_bat * o_b_s +
+                     (((pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_r_s +
+                      ((pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_c_s))
+        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+        tl.store(o + blk_o_idx, 1, mask=blk_o_msk)
+def build_sparsity_layout_adaption(x: Tensor, sparsity_layout_from: Tensor,
+                                   sparsity_block_size_from: int, sparsity_block_size_to: int,
+                                   triton_block_size: int = None) -> Tensor:
+    """Builds the sparsity layout of a block-sparse tensor in compressed form if a different sparsity block size were
+        used.
+    Args:
+        x (Tensor): A block-sparse tensor in compressed form.
+        sparsity_layout_from (Tensor): The sparsity layout of the input block-sparse tensor.
+        sparsity_block_size_from (int): The size of the sparsity blocks of the input tensor.
+        sparsity_block_size_to (int): The desired size of the sparsity blocks for the resulting layout.
+        triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
+    Returns:
+        Tensor: The sparsity layout in regular form using the new sparsity block size of the input block-sparse tensor
+            in compressed form.
+    """
+    validate_dimensions(x)
+    validate_contiguous(x, sparsity_layout_from)
+    validate_device(x)
+    validate_sparsity(sparsity_block_size_from, (x, sparsity_layout_from))
+    validate_sparsity_block_size(sparsity_block_size_from, x)
+    validate_sparsity_block_size(sparsity_block_size_to)
+    min_sparsity_block_size = min(sparsity_block_size_from, sparsity_block_size_to)
+    validate_triton_block_size(triton_block_size, min_sparsity_block_size)
+    sparsity_lut = torch.nonzero(sparsity_layout_from).contiguous()
+    validate_contiguous(sparsity_layout_from, sparsity_lut)
+    o_b = sparsity_layout_from.size(0)
+    o_r = math.ceil(sparsity_layout_from.size(1) * sparsity_block_size_from // sparsity_block_size_to)
+    o_c = math.ceil(sparsity_layout_from.size(2) * sparsity_block_size_from // sparsity_block_size_to)
+    output = torch.zeros(o_b, o_r, o_c, device=x.device, dtype=torch.int32)
+    x_b, x_r, x_c = x.size()
+    x_b_s, x_r_s, x_c_s = x.stride()
+    s_lut_r, s_lut_c = sparsity_lut.size()
+    s_lut_r_s, s_lut_c_s = sparsity_lut.stride()
+    o_b_s, o_r_s, o_c_s = output.stride()
+    if triton_block_size is None:
+        triton_block_size = get_triton_block_size(sparsity_block_size_from)
+    triton_grid = lambda meta: [x_b,
+                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+    (kernel_sparsity_layout_adaption[triton_grid]
+     (x,
+      x_b, x_b_s, x_r_s, x_c_s,
+      sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+      output,
+      o_b, o_b_s, o_r_s, o_c_s,
+      sparsity_block_size_from,
+      sparsity_block_size_to,
+      triton_block_size))
+    return output
+@triton.jit
+def kernel_sparsity_layout_adaption(x,
+                                    x_b, x_b_s, x_r_s, x_c_s,
+                                    s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                    o,
+                                    o_b, o_b_s, o_r_s, o_c_s,
+                                    sparsity_block_size_from,
+                                    sparsity_block_size_to,
+                                    TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+    # Get triton block indices
+    pid_blk = tl.program_id(axis=0)
+    pid_row = tl.program_id(axis=1)
+    pid_col = tl.program_id(axis=2)
+    # Get sparsity index of current output block consisting of its batch, row, and column index
+    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
+    spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
+    spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
+    spa_col_msk = (spa_col_idx < s_lut_r * s_lut_r_s)
+    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+    # Load x values
+    blk_x_idx = ((pid_blk * x_b_s) +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx < x_b * x_b_s)
+    blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+    # Store sparsity layout value
+    if tl.min(blk_x) != 0 or tl.max(blk_x) != 0:
+        blk_o_idx = ((spa_bat * o_b_s) +
+                     (((spa_row * sparsity_block_size_from + pid_row * TRITON_BLOCK_SIZE)
+                       // sparsity_block_size_to) * o_r_s) +
+                     (((spa_col * sparsity_block_size_from + pid_col * TRITON_BLOCK_SIZE)
+                       // sparsity_block_size_to) * o_c_s))
+        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+        tl.store(o + blk_o_idx, 1, mask=blk_o_msk)

{blksprs-1.1 → blksprs-1.2.1}/blksprs/ops/conversion.py RENAMED Viewed

@@ -1,8 +1,11 @@
+from typing import Any
 import torch
 import triton
 from torch import Tensor
 from triton import language as tl
+from blksprs.layouting.sparsity_layout import build_sparsity_layout_adaption
 from blksprs.utils.tools import get_triton_block_size
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_triton_block_size
@@ -39,6 +42,9 @@ def to_dense(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int, fill_
     validate_contiguous(sparsity_reverse_lut)
+    if sparsity_layout.size(1) == 1 and sparsity_layout.size(2) == 1 and torch.all(sparsity_layout):
+        return x
     return _BlocksparseToDense.apply(x,
                                      sparsity_layout, sparsity_reverse_lut,
                                      sparsity_block_size, fill_value,
@@ -161,6 +167,9 @@ def to_sparse(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int, trit
     validate_contiguous(sparsity_layout, sparsity_lut)
+    if sparsity_layout.size(1) == 1 and sparsity_layout.size(2) == 1 and torch.all(sparsity_layout):
+        return x
     return _BlocksparseToSparse.apply(x,
                                       sparsity_layout, sparsity_lut,
                                       sparsity_block_size, n_sparse_blocks,
@@ -178,10 +187,10 @@ class _BlocksparseToSparse(torch.autograd.Function):
         x_b, x_r, x_c = x.size()
         x_b_s, x_r_s, x_c_s = x.stride()
-        o_b, o_r, o_c = output.size()
-        o_b_s, o_r_s, o_c_s = output.stride()
         s_lut_r, s_lut_c = sparsity_lut.size()
         s_lut_r_s, s_lut_c_s = sparsity_lut.stride()
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = output.stride()
         if triton_block_size is None:
             triton_block_size = get_triton_block_size(sparsity_block_size)
@@ -254,3 +263,189 @@ class _BlocksparseToSparse(torch.autograd.Function):
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE) * o_c_s))[None, :])
         blk_o_msk = (blk_o_idx < (pid_blk + 1) * o_b_s)
         tl.store(o + blk_o_idx, blk_d, mask=blk_o_msk)
+def adapt_layout(x: Tensor, sparsity_layout_from: Tensor, sparsity_block_size_from: int, sparsity_block_size_to: int,
+                 preprocess_data: dict = None, triton_block_size: int = None) -> Tensor:
+    """Adapts the sparsity layout of a block-sparse tensor, resulting in a new block-sparse tensor in compressed form
+        conforming to the new sparsity layout (and sparsity block size) definition.
+    Args:
+        x (Tensor): A block-sparse tensor in compressed form.
+        sparsity_layout_from (Tensor): The sparsity layout of the input block-sparse tensor.
+        sparsity_block_size_from (int): The size of the sparsity blocks of the input sparsity layout.
+        sparsity_block_size_to (int): The size of the sparsity blocks of the output sparsity layout.
+        preprocess_data (dict): A dictionary containing data otherwise computed by the function (default ``None``).
+        triton_block_size (int): The block size to use for the triton kernel (default ``None``).
+    Returns:
+        Tensor: The block-sparse tensor in compressed form with the adapted sparsity layout and sparsity block size.
+    """
+    validate_dimensions(x)
+    validate_contiguous(x, sparsity_layout_from)
+    validate_device(x)
+    validate_sparsity(sparsity_block_size_from, (x, sparsity_layout_from))
+    validate_sparsity_block_size(sparsity_block_size_from, x)
+    validate_sparsity_block_size(sparsity_block_size_to)
+    min_sparsity_block_size = min(sparsity_block_size_from, sparsity_block_size_to)
+    validate_triton_block_size(triton_block_size, min_sparsity_block_size)
+    if preprocess_data is None:
+        preprocess_data = {}
+    if "sparsity_reverse_lut_from" not in preprocess_data:
+        sparsity_layout_from_flat = sparsity_layout_from.reshape(-1)
+        sparsity_reverse_lut_from = ((torch.cumsum(sparsity_layout_from_flat, dim=-1) - 1) *
+                                     (sparsity_layout_from_flat == 1) -
+                                     (1 * (sparsity_layout_from_flat == 0)))
+    else:
+        sparsity_reverse_lut_from = preprocess_data["sparsity_reverse_lut_from"]
+    if "sparsity_layout_to" not in preprocess_data:
+        sparsity_layout_to = build_sparsity_layout_adaption(x, sparsity_layout_from,
+                                                            sparsity_block_size_from, sparsity_block_size_to,
+                                                            triton_block_size)
+    else:
+        sparsity_layout_to = preprocess_data["sparsity_layout_to"]
+    if "sparsity_lut_to" not in preprocess_data:
+        sparsity_lut_to = torch.nonzero(sparsity_layout_to).contiguous()
+    else:
+        sparsity_lut_to = preprocess_data["sparsity_lut_to"]
+    if "n_sparse_blocks_to" not in preprocess_data:
+        n_sparse_blocks_to = torch.sum(sparsity_layout_to.to(torch.int)).item()
+    else:
+        n_sparse_blocks_to = preprocess_data["n_sparse_blocks_to"]
+    validate_contiguous(sparsity_layout_to, sparsity_reverse_lut_from, sparsity_lut_to)
+    if (sparsity_block_size_from == sparsity_block_size_to) and torch.equal(sparsity_layout_from, sparsity_layout_to):
+        return x
+    return _BlocksparseAdaptLayout.apply(x,
+                                         sparsity_layout_from, sparsity_reverse_lut_from, sparsity_block_size_from,
+                                         sparsity_layout_to, sparsity_lut_to, sparsity_block_size_to,
+                                         n_sparse_blocks_to, min_sparsity_block_size, triton_block_size)
+class _BlocksparseAdaptLayout(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x: Tensor,
+                sparsity_layout_from: Tensor, sparsity_reverse_lut_from: Tensor, sparsity_block_size_from: int,
+                sparsity_layout_to: Tensor, sparsity_lut_to: Tensor, sparsity_block_size_to: int,
+                n_sparse_blocks_to: int, min_sparsity_block_size: int, triton_block_size: int) -> Tensor:
+        output = torch.zeros(size=(n_sparse_blocks_to, sparsity_block_size_to, sparsity_block_size_to),
+                             dtype=x.dtype, device=x.device)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = x.stride()
+        s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout_from.size()
+        s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = sparsity_layout_from.stride()
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = output.stride()
+        s_lut_o_r, s_lut_o_c = sparsity_lut_to.size()
+        s_lut_o_r_s, s_lut_o_c_s = sparsity_lut_to.stride()
+        if triton_block_size is None:
+            triton_block_size = get_triton_block_size(min_sparsity_block_size)
+        triton_grid = lambda meta: [o_b,
+                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+        (_BlocksparseAdaptLayout.kernel_adapt_layout[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
+          sparsity_reverse_lut_from,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          sparsity_lut_to, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+          sparsity_block_size_from,
+          sparsity_block_size_to,
+          triton_block_size))
+        ctx.save_for_backward(x, sparsity_layout_from, sparsity_layout_to)
+        ctx.sparsity_block_size_from = sparsity_block_size_from
+        ctx.sparsity_block_size_to = sparsity_block_size_to
+        ctx.triton_block_size = triton_block_size
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        x, sparsity_layout_from, sparsity_layout_to = ctx.saved_tensors
+        sparsity_block_size_from = ctx.sparsity_block_size_from
+        sparsity_block_size_to = ctx.sparsity_block_size_to
+        triton_block_size = ctx.triton_block_size
+        return adapt_layout(grad_output, sparsity_layout_to, sparsity_block_size_to, sparsity_block_size_from,
+                            preprocess_data={"sparsity_layout_to": sparsity_layout_from},
+                            triton_block_size=triton_block_size), None, None, None, None, None, None, None, None, None
+    @staticmethod
+    @triton.jit
+    def kernel_adapt_layout(x,
+                            x_b, x_b_s, x_r_s, x_c_s,
+                            s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
+                            r_lut_x,
+                            o,
+                            o_b, o_b_s, o_r_s, o_c_s,
+                            s_lut_o, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+                            sparsity_block_size_from,
+                            sparsity_block_size_to,
+                            TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+        # Get triton block indices
+        pid_blk = tl.program_id(axis=0)
+        pid_row = tl.program_id(axis=1)
+        pid_col = tl.program_id(axis=2)
+        # Get position of current sparsity block consisting of its batch, row, and column index
+        spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
+        spa_bat_o_msk = (spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
+        spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
+        spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
+        spa_row_o_msk = (spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
+        spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
+        spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
+        spa_col_o_msk = (spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
+        spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+        # Get equivalent sparsity block in from layout
+        spa_bat_x = spa_bat_o
+        spa_row_x = (spa_row_o * sparsity_block_size_to + pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size_from
+        spa_col_x = (spa_col_o * sparsity_block_size_to + pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size_from
+        # # Get reverse sparsity indices for x
+        rev_idx_spa_x_idx = (spa_bat_x * s_l_x_b_s +
+                             spa_row_x * s_l_x_r_s +
+                             spa_col_x * s_l_x_c_s)
+        rev_idx_spa_x_msk = (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
+        rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
+        # If block is present commence operations
+        if rev_idx_spa_x >= 0:
+            # Calculate triton block size shifts
+            shift_row_x = ((spa_row_o * sparsity_block_size_to + pid_row * TRITON_BLOCK_SIZE)
+                           % sparsity_block_size_from) // TRITON_BLOCK_SIZE
+            shift_col_x = ((spa_col_o * sparsity_block_size_to + pid_col * TRITON_BLOCK_SIZE)
+                           % sparsity_block_size_from) // TRITON_BLOCK_SIZE
+            # Load x values
+            blk_x_idx = ((rev_idx_spa_x * x_b_s) +
+                         ((shift_row_x * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                         ((shift_col_x * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+            blk_x_msk = (blk_x_idx < x_b * x_b_s)
+            blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+            # Store output
+            blk_o_idx = ((pid_blk * o_b_s) +
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                         ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+            blk_o_msk = (blk_o_idx < o_b * o_b_s)
+            tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)

{blksprs-1.1 → blksprs-1.2.1}/blksprs.egg-info/PKG-INFO RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.1
+Version: 1.2.1
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
 Project-URL: Bugtracker, https://github.com/FelixSchoen/blksprs/issues
-Requires-Python: >=3.12
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 Requires-Dist: torch
 Provides-Extra: test
@@ -21,6 +21,9 @@ Requires-Dist: pdoc3; extra == "deploy"
 # blksprs
+[![GitHub Release](https://img.shields.io/github/v/release/FelixSchoen/blksprs?include_prereleases&label=Latest%20Release)](https://github.com/FelixSchoen/blksprs/releases)
+[![Python Version](https://img.shields.io/badge/Python%20Version-3.11-blue)](https://www.python.org/downloads/release/python-3119/)
 ## Overview
 A lightweight and efficient library for operations on block-sparse matrices in PyTorch using Triton.
@@ -33,7 +36,8 @@ Currently supported operations (includes gradient calculation):
 - Transposition
 - Gather
 - Scatter (_supports either no reduction or summation, gradients are only available for summation_)
-- Conversion from and to sparse form
+- Conversion to and from sparse form
+- Conversion to different sparsity layouts and different sparsity block sizes
 As with this library sparse matrices are represented using a tuple of `(matrix, sparsity_layout, sparsity_block_size)`,
 any element-wise operations can be applied in regular torch-like fashion.
@@ -59,6 +63,11 @@ We recommend installing blksprs from [PyPI](https://pypi.org/project/blksprs/) u
 ```pip install blksprs```
+### Dependencies
+- [PyTorch](https://pytorch.org/) (built with v2.4.0)
+- _[Triton](https://github.com/triton-lang/triton) (included with PyTorch)_
 ## Changelog
 See [`CHANGELOG.md`](https://github.com/FelixSchoen/blksprs/blob/main/CHANGELOG.md) for a detailed changelog.

{blksprs-1.1 → blksprs-1.2.1}/pyproject.toml RENAMED Viewed

@@ -1,10 +1,10 @@
 [project]
 name = "blksprs"
-version = "1.1"
+version = "1.2.1"
 authors = [{ name = "Felix Schön", email = "schoen@kr.tuwien.ac.at" }]
 description = "A lightweight library for operations on blocksparse matrices in PyTorch."
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.11"
 license = { file = "LICENSE.md" }
 dependencies = [
     "torch"

blksprs-1.1/blksprs/layouting/sparsity_layout.py DELETED Viewed

@@ -1,78 +0,0 @@
-import torch
-import triton
-from torch import Tensor
-from triton import language as tl
-from blksprs.utils.tools import get_triton_block_size
-from blksprs.utils.validation import validate_triton_block_size, validate_dimensions, validate_device, \
-    validate_contiguous
-def build_sparsity_layout(x: Tensor, sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
-    """Builds the sparsity layout of a dense tensor covering its sparse blocks.
-    Args:
-        x (Tensor): A block-sparse (or dense) tensor in regular form.
-        sparsity_block_size (int): The size of the sparsity blocks.
-        triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
-    Returns:
-        Tensor: The sparsity layout of the input block-sparse (or dense) tensor.
-    """
-    validate_dimensions(x)
-    validate_contiguous(x)
-    validate_device(x)
-    output = torch.zeros(x.size(0), x.size(1) // sparsity_block_size, x.size(2) // sparsity_block_size,
-                         device=x.device, dtype=torch.int32)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = x.stride()
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = output.stride()
-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-    validate_triton_block_size(triton_block_size, sparsity_block_size)
-    triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-    (kernel_sparsity_layout[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      sparsity_block_size,
-      triton_block_size))
-    return output
-@triton.jit
-def kernel_sparsity_layout(x,
-                           x_b, x_b_s, x_r_s, x_c_s,
-                           o,
-                           o_b, o_b_s, o_r_s, o_c_s,
-                           sparsity_block_size,
-                           TRITON_BLOCK_SIZE: tl.constexpr) -> None:
-    # Get triton block indices
-    pid_bat = tl.program_id(axis=0)
-    pid_row = tl.program_id(axis=1)
-    pid_col = tl.program_id(axis=2)
-    blk_x_idx = (pid_bat * x_b_s +
-                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx < x_b * x_b_s)
-    blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
-    if tl.min(blk_x) != 0 or tl.max(blk_x) != 0:
-        blk_o_idx = (pid_bat * o_b_s +
-                     (((pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_r_s +
-                      ((pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_c_s))
-        blk_o_msk = (blk_o_idx < o_b * o_b_s)
-        tl.store(o + blk_o_idx, 1, mask=blk_o_msk)