blksprs 1.8.3__tar.gz → 1.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {blksprs-1.8.3 → blksprs-1.9.1}/PKG-INFO +7 -3
- {blksprs-1.8.3 → blksprs-1.9.1}/README.md +6 -2
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/__init__.py +4 -1
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/layouting/distribution_layout.py +23 -5
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/conversion.py +26 -34
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/distribution.py +94 -35
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/experimental/distribution_mdi.py +8 -0
- blksprs-1.9.1/blksprs/ops/flow.py +147 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/misc/row_wise.py +8 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/partitioning.py +3 -3
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/repeat.py +8 -147
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/softmax.py +4 -0
- blksprs-1.9.1/blksprs/utils/layout_utils.py +17 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/utils/processing.py +35 -2
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/utils/validation.py +2 -1
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs.egg-info/PKG-INFO +7 -3
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs.egg-info/SOURCES.txt +2 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/pyproject.toml +1 -1
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/layouting/sparsity_layout.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/matmul.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/misc/broadcast_ops.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/misc/exp.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/transpose.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/utils/benchmarking.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/utils/blksprs_tensor.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs/utils/tools.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs.egg-info/dependency_links.txt +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs.egg-info/requires.txt +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/blksprs.egg-info/top_level.txt +0 -0
- {blksprs-1.8.3 → blksprs-1.9.1}/setup.cfg +0 -0
{blksprs-1.8.3 → blksprs-1.9.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.8.3
+Version: 1.9.1
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs

@@ -64,8 +64,12 @@ Further helpful operations (included in the ``bs.ops.misc`` module) that do **no
 - Row-wise sum, max, addition, and subtraction
 - Broadcast addition and subtraction between slices

-Furthermore, the library provides a set of utility functions
-
+Furthermore, the library provides a set of utility functions
+
+- for the creation of sparsity layouts based on existing
+dense tensors and for the scatter operation (module ``bs.layouting``),
+- for the application of ``nn.Linear``, ``nn.Dropout``, and ``nn.LayerNorm`` layers to block-sparse tensors,
+- as well as utility functions to apply linear layers,
 ensure correct input dimensionality, and validate input (module ``bs.utils``).

 ## Installation
{blksprs-1.8.3 → blksprs-1.9.1}/README.md

@@ -45,8 +45,12 @@ Further helpful operations (included in the ``bs.ops.misc`` module) that do **no
 - Row-wise sum, max, addition, and subtraction
 - Broadcast addition and subtraction between slices

-Furthermore, the library provides a set of utility functions
-
+Furthermore, the library provides a set of utility functions
+
+- for the creation of sparsity layouts based on existing
+dense tensors and for the scatter operation (module ``bs.layouting``),
+- for the application of ``nn.Linear``, ``nn.Dropout``, and ``nn.LayerNorm`` layers to block-sparse tensors,
+- as well as utility functions to apply linear layers,
 ensure correct input dimensionality, and validate input (module ``bs.utils``).

 ## Installation
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/__init__.py

@@ -1,5 +1,6 @@
 from blksprs.utils.blksprs_tensor import BlksprsTensor

+
 class ops:
     from blksprs.ops.conversion import to_dense, to_sparse, from_blksprs, to_blksprs, adapt_layout
     from blksprs.ops.distribution import gather, scatter, scatter_reduce

@@ -22,13 +23,15 @@ class layouting:
     from blksprs.layouting.distribution_layout import build_distribution_layout
     from blksprs.layouting.sparsity_layout import build_sparsity_layout, build_sparsity_layout_adaption, \
         build_sparsity_layout_matmul, build_sparsity_layout_matmul_fast
+    from blksprs.utils.layout_utils import build_full_sparsity_layout

     class experimental:
         from blksprs.ops.experimental.distribution_mdi import build_distribution_layout_mdi


 class utils:
-    from blksprs.utils.processing import apply_torch_linear
+    from blksprs.utils.processing import apply_torch_linear, apply_torch_normalisation, apply_torch_dropout, \
+        apply_function_applicable_row_wise
     from blksprs.utils.tools import do_shape_blocksparse, undo_shape_blocksparse
     from blksprs.utils.validation import disable_validation

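The namespace classes above re-export the new 1.9.x helpers. For orientation, a minimal usage sketch, not part of the package sources; the tensor shape, the sparsity block size of 32, and the CUDA device are illustrative assumptions:

import torch
import blksprs as bs

x = torch.randn(2, 64, 64, device="cuda")                       # dense input
layout = bs.layouting.build_full_sparsity_layout(x, 32)         # new in 1.9.x
x_bs = bs.ops.to_sparse(x, layout, 32)                          # compressed block-sparse form

dropout = torch.nn.Dropout(p=0.1)
x_bs = bs.utils.apply_torch_dropout(x_bs, layout, 32, dropout)  # new in 1.9.x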
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/layouting/distribution_layout.py

@@ -10,13 +10,14 @@ from blksprs.utils.validation import validate_triton_block_size, validate_dimens


 def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: Tensor,
-                              size_target: torch.Size,
+                              dim: int, size_target: torch.Size,
                               sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
     """Builds the sparsity layout of either the source of a gather or the target of a scatter operation.

     Args:
         indices (BlksprsTensor): The block-sparse indices tensor in compressed form used for the gather or scatter operation.
         sparsity_layout_indices (Tensor): The sparsity layout of the indices block-sparse tensor.
+        dim (int): The dimension along which the operation is conducted.
         size_target (torch.Size): The size of the block-sparse target tensor in regular form.
         sparsity_block_size (int): The size of the sparsity blocks.
         triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).

@@ -31,6 +32,8 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T

     sparsity_lut_i = torch.nonzero(sparsity_layout_indices).contiguous()

+    adjusted_dim = dim % 3
+
     output = torch.zeros(size_target[0], size_target[1] // sparsity_block_size, size_target[2] // sparsity_block_size,
                          dtype=torch.bool, device=indices.device)

@@ -55,6 +58,7 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
          i_b, i_b_s, i_r_s, i_c_s,
          sparsity_lut_i,
          s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+         adjusted_dim,
          output,
          o_b, o_b_s, o_r_s, o_c_s,
          sparsity_block_size,

@@ -68,6 +72,7 @@ def kernel_distribution_layout(i,
                                i_b, i_b_s, i_r_s, i_c_s,
                                s_lut_i,
                                s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+                               dim,
                                o,
                                o_b, o_b_s, o_r_s, o_c_s,
                                sparsity_block_size,

@@ -86,17 +91,30 @@ def kernel_distribution_layout(i,
     spa_row_i_msk = (spa_row_i_idx < s_lut_i_r * s_lut_i_r_s)
     spa_row_i = tl.load(s_lut_i + spa_row_i_idx, mask=spa_row_i_msk)

+    spa_col_i_idx = (pid_blk * s_lut_i_r_s + 2 * s_lut_i_c_s)
+    spa_col_i_msk = (spa_col_i_idx < s_lut_i_r * s_lut_i_r_s)
+    spa_col_i = tl.load(s_lut_i + spa_col_i_idx, mask=spa_col_i_msk)
+
     blk_i_idx = (pid_blk * i_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
     blk_i_msk = (blk_i_idx < i_b * i_b_s)
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk)

-
+    dst_bat_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_i, dtype=tl.int32)
+    dst_row_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_i, dtype=tl.int32)
+    dst_col_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_i, dtype=tl.int32)
+    if dim == 0:
+        dst_bat_idx = blk_i
+    elif dim == 1:
+        dst_row_idx = blk_i // sparsity_block_size
+    elif dim == 2:
+        dst_col_idx = blk_i // sparsity_block_size
+
     blk_v = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), 1, dtype=tl.int32)

-    blk_o_idx = ((
-    (
-    (
+    blk_o_idx = ((dst_bat_idx * o_b_s) +
+                 (dst_row_idx * o_r_s) +
+                 (dst_col_idx * o_c_s))
     blk_o_msk = (blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, blk_v, mask=blk_o_msk)
|
|
|
289
289
|
|
|
290
290
|
|
|
291
291
|
def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_size_from: int,
|
|
292
|
-
sparsity_block_size_to: int,
|
|
293
|
-
|
|
292
|
+
sparsity_block_size_to: int, sparsity_layout_to: Tensor = None,
|
|
293
|
+
triton_block_size: int = None) -> (BlksprsTensor, Tensor):
|
|
294
294
|
"""Adapts the sparsity layout of a block-sparse tensor, resulting in a new block-sparse tensor in compressed form
|
|
295
295
|
conforming to the new sparsity layout (and sparsity block size) definition.
|
|
296
296
|
|
|
@@ -299,11 +299,12 @@ def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_
|
|
|
299
299
|
sparsity_layout_from (Tensor): The sparsity layout of the input block-sparse tensor.
|
|
300
300
|
sparsity_block_size_from (int): The size of the sparsity blocks of the input sparsity layout.
|
|
301
301
|
sparsity_block_size_to (int): The size of the sparsity blocks of the output sparsity layout.
|
|
302
|
-
|
|
302
|
+
sparsity_layout_to (Tensor): The sparsity layout of the output block-sparse tensor (default ``None``).
|
|
303
303
|
triton_block_size (int): The block size to use for the triton kernel (default ``None``).
|
|
304
304
|
|
|
305
305
|
Returns:
|
|
306
306
|
BlksprsTensor: The block-sparse tensor in compressed form with the adapted sparsity layout and sparsity block size.
|
|
307
|
+
Tensor: The sparsity layout of the resulting output tensor.
|
|
307
308
|
|
|
308
309
|
"""
|
|
309
310
|
x = x.contiguous()
|
|
@@ -317,52 +318,42 @@ def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_
|
|
|
317
318
|
min_sparsity_block_size = min(sparsity_block_size_from, sparsity_block_size_to)
|
|
318
319
|
validate_triton_block_size(triton_block_size, min_sparsity_block_size)
|
|
319
320
|
|
|
320
|
-
|
|
321
|
-
|
|
321
|
+
sparsity_layout_from_flat = sparsity_layout_from.reshape(-1)
|
|
322
|
+
sparsity_reverse_lut_from = ((torch.cumsum(sparsity_layout_from_flat, dim=-1) - 1) *
|
|
323
|
+
(sparsity_layout_from_flat == 1) -
|
|
324
|
+
(1 * (sparsity_layout_from_flat == 0)))
|
|
322
325
|
|
|
323
|
-
if
|
|
324
|
-
sparsity_layout_from_flat = sparsity_layout_from.reshape(-1)
|
|
325
|
-
sparsity_reverse_lut_from = ((torch.cumsum(sparsity_layout_from_flat, dim=-1) - 1) *
|
|
326
|
-
(sparsity_layout_from_flat == 1) -
|
|
327
|
-
(1 * (sparsity_layout_from_flat == 0)))
|
|
328
|
-
else:
|
|
329
|
-
sparsity_reverse_lut_from = preprocess_data["sparsity_reverse_lut_from"]
|
|
330
|
-
|
|
331
|
-
if "sparsity_layout_to" not in preprocess_data:
|
|
326
|
+
if sparsity_layout_to is None:
|
|
332
327
|
sparsity_layout_to = build_sparsity_layout_adaption(x, sparsity_layout_from,
|
|
333
328
|
sparsity_block_size_from, sparsity_block_size_to,
|
|
334
329
|
triton_block_size)
|
|
335
|
-
else:
|
|
336
|
-
sparsity_layout_to = preprocess_data["sparsity_layout_to"]
|
|
337
330
|
|
|
338
|
-
|
|
339
|
-
sparsity_lut_to = torch.nonzero(sparsity_layout_to).contiguous()
|
|
340
|
-
else:
|
|
341
|
-
sparsity_lut_to = preprocess_data["sparsity_lut_to"]
|
|
331
|
+
sparsity_lut_to = torch.nonzero(sparsity_layout_to).contiguous()
|
|
342
332
|
|
|
343
|
-
|
|
344
|
-
n_sparse_blocks_to = torch.sum(sparsity_layout_to.to(torch.int)).item()
|
|
345
|
-
else:
|
|
346
|
-
n_sparse_blocks_to = preprocess_data["n_sparse_blocks_to"]
|
|
333
|
+
n_sparse_blocks_to = torch.sum(sparsity_layout_to.to(torch.int)).item()
|
|
347
334
|
|
|
348
|
-
validate_contiguous(
|
|
335
|
+
validate_contiguous(sparsity_reverse_lut_from, sparsity_layout_to, sparsity_lut_to)
|
|
349
336
|
|
|
350
337
|
if (sparsity_block_size_from == sparsity_block_size_to) and torch.equal(sparsity_layout_from, sparsity_layout_to):
|
|
351
|
-
return BlksprsTensor(x)
|
|
338
|
+
return BlksprsTensor(x), sparsity_layout_to
|
|
352
339
|
|
|
353
340
|
return BlksprsTensor(_BlocksparseAdaptLayout.apply(x,
|
|
354
341
|
sparsity_layout_from, sparsity_reverse_lut_from,
|
|
355
342
|
sparsity_block_size_from,
|
|
356
|
-
sparsity_layout_to, sparsity_lut_to,
|
|
357
|
-
|
|
343
|
+
sparsity_layout_to, sparsity_lut_to,
|
|
344
|
+
sparsity_block_size_to,
|
|
345
|
+
n_sparse_blocks_to, min_sparsity_block_size,
|
|
346
|
+
triton_block_size)), sparsity_layout_to
|
|
358
347
|
|
|
359
348
|
|
|
360
349
|
class _BlocksparseAdaptLayout(torch.autograd.Function):
|
|
361
350
|
|
|
362
351
|
@staticmethod
|
|
363
352
|
def forward(ctx, x: Tensor,
|
|
364
|
-
sparsity_layout_from: Tensor, sparsity_reverse_lut_from: Tensor,
|
|
365
|
-
|
|
353
|
+
sparsity_layout_from: Tensor, sparsity_reverse_lut_from: Tensor,
|
|
354
|
+
sparsity_block_size_from: int,
|
|
355
|
+
sparsity_layout_to: Tensor, sparsity_lut_to: Tensor,
|
|
356
|
+
sparsity_block_size_to: int,
|
|
366
357
|
n_sparse_blocks_to: int, min_sparsity_block_size: int, triton_block_size: int) -> Tensor:
|
|
367
358
|
output = torch.zeros(size=(n_sparse_blocks_to, sparsity_block_size_to, sparsity_block_size_to),
|
|
368
359
|
dtype=x.dtype, device=x.device)
|
|
@@ -409,9 +400,10 @@ class _BlocksparseAdaptLayout(torch.autograd.Function):
|
|
|
409
400
|
sparsity_block_size_to = ctx.sparsity_block_size_to
|
|
410
401
|
triton_block_size = ctx.triton_block_size
|
|
411
402
|
|
|
412
|
-
return adapt_layout(
|
|
413
|
-
|
|
414
|
-
|
|
403
|
+
return adapt_layout(
|
|
404
|
+
grad_output, sparsity_layout_to, sparsity_block_size_to, sparsity_block_size_from,
|
|
405
|
+
sparsity_layout_to=sparsity_layout_from,
|
|
406
|
+
triton_block_size=triton_block_size)[0], None, None, None, None, None, None, None, None, None
|
|
415
407
|
|
|
416
408
|
@staticmethod
|
|
417
409
|
@triton.jit
|
|
@@ -448,7 +440,7 @@ class _BlocksparseAdaptLayout(torch.autograd.Function):
|
|
|
448
440
|
spa_row_x = (spa_row_o * sparsity_block_size_to + pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size_from
|
|
449
441
|
spa_col_x = (spa_col_o * sparsity_block_size_to + pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size_from
|
|
450
442
|
|
|
451
|
-
#
|
|
443
|
+
# Get reverse sparsity indices for x
|
|
452
444
|
rev_idx_spa_x_idx = (spa_bat_x * s_l_x_b_s +
|
|
453
445
|
spa_row_x * s_l_x_r_s +
|
|
454
446
|
spa_col_x * s_l_x_c_s)
|
|
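adapt_layout now also returns the layout the output conforms to, and optionally accepts a precomputed target layout. A minimal sketch of the updated call; shapes, block sizes, and the CUDA device are illustrative assumptions:

import torch
import blksprs as bs

x = torch.randn(2, 128, 128, device="cuda")
layout_64 = bs.layouting.build_full_sparsity_layout(x, 64)
x_bs = bs.ops.to_sparse(x, layout_64, 64)

# Re-block from 64x64 to 32x32 sparsity blocks; the target layout is derived
# automatically when sparsity_layout_to is left as None.
x_bs_32, layout_32 = bs.ops.adapt_layout(x_bs, layout_64, 64, 32)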
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/distribution.py

@@ -3,19 +3,23 @@ import triton
 from torch import Tensor
 from triton import language as tl

+from blksprs.ops.conversion import to_dense
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import get_triton_block_size, stride
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_dtype_int, validate_sparsity_block_size, validate_triton_block_size


-def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
+def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
+           dim: int,
+           idx: BlksprsTensor, sparsity_layout_idx: Tensor,
           sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
     """Applies a gather operation on a block-sparse tensor in compressed form.

     Args:
         src (BlksprsTensor): The source block-sparse tensor in compressed form to gather from.
         sparsity_layout_src (Tensor): The sparsity layout of the source block-sparse tensor.
+        dim (int): The dimension along which to gather.
         idx (BlksprsTensor): The block-sparse indices tensor in compressed form specifying how to gather from the source tensor.
         sparsity_layout_idx (Tensor): The sparsity layout of the indices block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.

@@ -46,16 +50,18 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor, idx: BlksprsTensor,
     validate_contiguous(sparsity_layout_src, sparsity_reverse_lut_x,
                         sparsity_layout_idx, sparsity_lut_i)

+    adjusted_dim = dim % 3
+
     return BlksprsTensor(_BlocksparseGather.apply(src, sparsity_layout_src, sparsity_reverse_lut_x,
-
-
+                                                  adjusted_dim, idx, sparsity_layout_idx, sparsity_lut_i,
+                                                  sparsity_block_size, triton_block_size))


 class _BlocksparseGather(torch.autograd.Function):

     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_x: Tensor, sparsity_reverse_lut_x: Tensor,
-                i: Tensor, sparsity_layout_i: Tensor, sparsity_lut_i: Tensor,
+                dim: int, i: Tensor, sparsity_layout_i: Tensor, sparsity_lut_i: Tensor,
                 sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
         output = torch.empty_like(i, dtype=x.dtype)

@@ -82,6 +88,7 @@ class _BlocksparseGather(torch.autograd.Function):
              x_b, x_b_s, x_r_s, x_c_s,
              s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
              sparsity_reverse_lut_x,
+             dim,
              i,
              i_b, i_b_s, i_r_s, i_c_s,
              output,

@@ -91,6 +98,7 @@ class _BlocksparseGather(torch.autograd.Function):
              triton_block_size))

         ctx.save_for_backward(sparsity_layout_x, i, sparsity_layout_i)
+        ctx.dim = dim
         ctx.sparsity_block_size = sparsity_block_size
         ctx.triton_block_size = triton_block_size

@@ -99,15 +107,15 @@ class _BlocksparseGather(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
         sparsity_layout_x, i, sparsity_layout_i = ctx.saved_tensors
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         triton_block_size = ctx.triton_block_size

         return scatter_reduce(grad_output, sparsity_layout_i,
-                              i,
-                              sparsity_layout_x,
-                              sparsity_block_size,
+                              dim, i,
+                              sparsity_layout_x, sparsity_block_size,
                               reduce_op="sum",
-                              triton_block_size=triton_block_size), None, None, None, None, None, None, None
+                              triton_block_size=triton_block_size), None, None, None, None, None, None, None, None

     @staticmethod
     @triton.jit

@@ -115,6 +123,7 @@ class _BlocksparseGather(torch.autograd.Function):
                          x_b, x_b_s, x_r_s, x_c_s,
                          s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
                          r_lut_x,
+                         dim,
                          i,
                          i_b, i_b_s, i_r_s, i_c_s,
                          o,

@@ -136,6 +145,10 @@ class _BlocksparseGather(torch.autograd.Function):
         spa_row_o_msk = (spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
         spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)

+        spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
+        spa_col_o_msk = (spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
+        spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+
         # Load index values
         blk_i_idx = ((pid_blk * i_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +

@@ -143,33 +156,50 @@ class _BlocksparseGather(torch.autograd.Function):
         blk_i_msk = (blk_i_idx < i_b * i_b_s)
         blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)

-        # Get
+        # Get indices of sparsity blocks and positions within the blocks
         pos_spa_blk_x = blk_i // sparsity_block_size
-
+        pos_spa_int_x = blk_i % sparsity_block_size
+
+        rev_dst_bat_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_o, dtype=tl.int32)
+        rev_dst_row_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_o, dtype=tl.int32)
+        rev_dst_col_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_o, dtype=tl.int32)
+        dst_row_x = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        dst_col_x = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        if dim == 0:
+            rev_dst_bat_x = blk_i
+        elif dim == 1:
+            rev_dst_row_x = pos_spa_blk_x
+            dst_row_x = pos_spa_int_x * x_r_s
+        elif dim == 2:
+            rev_dst_col_x = pos_spa_blk_x
+            dst_col_x = pos_spa_int_x * x_c_s

         # Load reverse sparsity indices for x
-        rev_idx_spa_x_idx = ((
-        (
-        (
+        rev_idx_spa_x_idx = ((rev_dst_bat_x * s_l_x_b_s) +
+                             (rev_dst_row_x * s_l_x_r_s) +
+                             (rev_dst_col_x * s_l_x_c_s))
         rev_idx_spa_x_msk = (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
         rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)

         # Load x values
         blk_x_idx = ((rev_idx_spa_x * x_b_s) +
-
-
-        blk_x_msk = (blk_x_idx < x_b * x_b_s)
+                     dst_row_x +
+                     dst_col_x)
+        blk_x_msk = ((blk_x_idx < x_b * x_b_s) & rev_idx_spa_x_msk != -1)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)

         # Store output
         blk_o_idx = ((pid_blk * o_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+        blk_o_msk = ((blk_o_idx < o_b * o_b_s) & rev_idx_spa_x_msk != -1)
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)


 def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
+            dim: int,
             idx: BlksprsTensor,
             sparsity_layout_tgt: Tensor,
             sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:

@@ -177,6 +207,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,

     """
     return scatter_reduce(src, sparsity_layout_src,
+                          dim,
                           idx,
                           sparsity_layout_tgt,
                           sparsity_block_size,

@@ -184,6 +215,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,


 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
+                   dim: int,
                    idx: BlksprsTensor,
                    sparsity_layout_tgt: Tensor,
                    sparsity_block_size: int,

@@ -193,6 +225,7 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
     Args:
         src (BlksprsTensor): The source block-sparse tensor in compressed form to scatter from.
         sparsity_layout_src (Tensor): The sparsity layout of the source block-sparse tensor.
+        dim (int): The dimension along which to scatter.
         idx (BlksprsTensor): The block-sparse indices tensor in compressed form specifying how to scatter to the target tensor.
         sparsity_layout_tgt (Tensor): The sparsity layout of the target block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.

@@ -230,18 +263,20 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
     validate_contiguous(sparsity_layout_src, sparsity_lut_x,
                         sparsity_layout_tgt, sparsity_reverse_lut_o)

+    adjusted_dim = dim % 3
+
     return BlksprsTensor(_BlocksparseScatterReduce.apply(src, sparsity_layout_src, sparsity_lut_x,
-
-
-
-
+                                                         adjusted_dim, idx,
+                                                         sparsity_layout_tgt, sparsity_reverse_lut_o,
+                                                         sparsity_block_size, n_sparse_blocks,
+                                                         reduce_op, triton_block_size))


 class _BlocksparseScatterReduce(torch.autograd.Function):

     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_x: Tensor, sparsity_lut_x: Tensor,
-                i: Tensor,
+                dim: int, i: Tensor,
                 sparsity_layout_o: Tensor, sparsity_reverse_lut_o: Tensor,
                 sparsity_block_size: int, n_sparse_blocks: int,
                 reduce_op: str, triton_block_size: int) -> Tensor:

@@ -274,10 +309,11 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
             (x,
              x_b, x_b_s, x_r_s, x_c_s,
              sparsity_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+             dim,
              i,
              i_b, i_b_s, i_r_s, i_c_s,
              output,
-             o_b, o_b_s,
+             o_b, o_b_s,
              s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
              sparsity_reverse_lut_o,
              reduce_op_ind,

@@ -285,6 +321,7 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
              triton_block_size))

         ctx.save_for_backward(sparsity_layout_x, i, sparsity_layout_o)
+        ctx.dim = dim
         ctx.sparsity_block_size = sparsity_block_size
         ctx.reduce_op = reduce_op
         ctx.triton_block_size = triton_block_size

@@ -294,13 +331,14 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
         sparsity_layout_x, i, sparsity_layout_o = ctx.saved_tensors
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         reduce_op = ctx.reduce_op
         triton_block_size = ctx.triton_block_size

         if reduce_op == "sum":
-            return gather(grad_output, sparsity_layout_o, i, sparsity_layout_x, sparsity_block_size,
-                          triton_block_size=triton_block_size), None, None, None, None, None, None, None, None, None
+            return gather(grad_output, sparsity_layout_o, dim, i, sparsity_layout_x, sparsity_block_size,
+                          triton_block_size=triton_block_size), None, None, None, None, None, None, None, None, None, None
         else:
             raise ValueError(f"Reduction operation '{reduce_op}' does not support backward pass")

@@ -309,10 +347,11 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
     def kernel_blocksparse_scatter(x,
                                    x_b, x_b_s, x_r_s, x_c_s,
                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                                   dim,
                                    i,
                                    i_b, i_b_s, i_r_s, i_c_s,
                                    o,
-                                   o_b, o_b_s,
+                                   o_b, o_b_s,
                                    s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
                                    r_lut_o,
                                    reduce_op_ind,

@@ -332,6 +371,10 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
         spa_row_x_msk = (spa_row_x_idx < s_lut_x_r * s_lut_x_r_s)
         spa_row_x = tl.load(s_lut_x + spa_row_x_idx, mask=spa_row_x_msk)

+        spa_col_x_idx = (pid_blk * s_lut_x_r_s + 2 * s_lut_x_c_s)
+        spa_col_x_msk = (spa_col_x_idx < s_lut_x_r * s_lut_x_r_s)
+        spa_col_x = tl.load(s_lut_x + spa_col_x_idx, mask=spa_col_x_msk)
+
         # Load x values
         blk_x_idx = ((pid_blk * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +

@@ -346,22 +389,38 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
         blk_i_msk = (blk_i_idx < i_b * i_b_s)
         blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)

-        # Get
-
-
+        # Get indices of sparsity blocks and positions within the blocks
+        pos_spa_blk_x = blk_i // sparsity_block_size
+        pos_spa_int_x = blk_i % sparsity_block_size
+
+        rev_dst_bat_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_x, dtype=tl.int32)
+        rev_dst_row_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_x, dtype=tl.int32)
+        rev_dst_col_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_x, dtype=tl.int32)
+        dst_row_o = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        dst_col_o = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        if dim == 0:
+            rev_dst_bat_o = blk_i
+        elif dim == 1:
+            rev_dst_row_o = pos_spa_blk_x
+            dst_row_o = pos_spa_int_x * x_r_s
+        elif dim == 2:
+            rev_dst_col_o = pos_spa_blk_x
+            dst_col_o = pos_spa_int_x * x_c_s

         # Load reverse sparsity indices for o
-        rev_idx_spa_o_idx = ((
-        (
-        (
+        rev_idx_spa_o_idx = ((rev_dst_bat_o * s_l_o_b_s) +
+                             (rev_dst_row_o * s_l_o_r_s) +
+                             (rev_dst_col_o * s_l_o_c_s))
         rev_idx_spa_o_msk = (rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
         rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)

         # Store output
         blk_o_idx = ((rev_idx_spa_o * o_b_s) +
-
-
-        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+                     dst_row_o +
+                     dst_col_o)
+        blk_o_msk = ((blk_o_idx < o_b * o_b_s) & rev_idx_spa_o_msk != -1)

         if reduce_op_ind == 0:
             tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
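gather, scatter, and scatter_reduce now take the distribution dimension explicitly. A minimal sketch of a gather along the column dimension and the matching scatter_reduce; shapes, index values, and the CUDA device are illustrative assumptions:

import torch
import blksprs as bs

sparsity_block_size = 32
src = torch.randn(1, 64, 64, device="cuda")
layout_src = bs.layouting.build_full_sparsity_layout(src, sparsity_block_size)
src_bs = bs.ops.to_sparse(src, layout_src, sparsity_block_size)

# Compressed indices selecting columns of src (dim=2), with their own layout.
layout_idx = torch.ones(1, 2, 2, dtype=torch.bool, device="cuda")
idx = torch.randint(0, 64, (4, 32, 32), dtype=torch.int64, device="cuda")

gathered = bs.ops.gather(src_bs, layout_src, 2, idx, layout_idx, sparsity_block_size)

# The backward of gather is the matching scatter_reduce along the same dim with "sum".
scattered = bs.ops.scatter_reduce(gathered, layout_idx, 2, idx, layout_src,
                                  sparsity_block_size, reduce_op="sum")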
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/experimental/distribution_mdi.py

@@ -153,6 +153,10 @@ class _BlocksparseGatherMDI(torch.autograd.Function):
         rev_idx_spa_x_msk = (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
         rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)

+        if rev_idx_spa_x == -1:
+            tl.device_assert(False)
+            return
+
         # Load x values
         blk_x_idx = ((rev_idx_spa_x * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +

@@ -342,6 +346,10 @@ class _BlocksparseScatterReduceMDI(torch.autograd.Function):
         rev_idx_spa_o_msk = (rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
         rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)

+        if rev_idx_spa_o == -1:
+            tl.device_assert(False)
+            return
+
         # Store output
         blk_o_idx = ((rev_idx_spa_o * o_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
blksprs-1.9.1/blksprs/ops/flow.py (new file)

@@ -0,0 +1,147 @@
+import torch
+import triton
+from torch import Tensor
+from triton import language as tl
+
+from blksprs.utils.tools import stride, get_triton_block_size
+
+
+@triton.jit
+def kernel_blocksparse_flow_pull(x,
+                                 x_b, x_b_s, x_r_s, x_c_s,
+                                 o,
+                                 o_b, o_b_s, o_r_s, o_c_s,
+                                 s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
+                                 s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                 r_lut,
+                                 TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+    # Get triton block indices
+    pid_blk = tl.program_id(axis=0)
+    pid_row = tl.program_id(axis=1)
+    pid_col = tl.program_id(axis=2)
+
+    # Get sparsity index of current output block consisting of its batch, row, and column index
+    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
+    spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+
+    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
+    spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+
+    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
+    spa_col_msk = (spa_col_idx < s_lut_r * s_lut_r_s)
+    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+
+    # Get reverse sparsity index
+    rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
+                       spa_row * s_l_o_r_s +
+                       spa_col * s_l_o_c_s)
+    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
+    rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
+
+    if rev_idx_spa == -1:
+        tl.device_assert(False)
+        return
+
+    blk_x_idx = (rev_idx_spa * x_b_s +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx < x_b * x_b_s)
+    blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+
+    blk_o_idx = (pid_blk * o_b_s +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+    blk_o_msk = (blk_o_idx < o_b * o_b_s)
+    tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
+
+
+@triton.jit
+def kernel_blocksparse_flow_push(x,
+                                 x_b, x_b_s, x_r_s, x_c_s,
+                                 s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
+                                 s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                 r_lut,
+                                 o,
+                                 o_b, o_b_s, o_r_s, o_c_s,
+                                 TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+    # Get triton block indices
+    pid_blk = tl.program_id(axis=0)
+    pid_row = tl.program_id(axis=1)
+    pid_col = tl.program_id(axis=2)
+
+    # Get sparsity index of current input block consisting of its batch, row, and column index
+    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
+    spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+
+    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
+    spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+
+    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
+    spa_col_msk = (spa_col_idx < s_lut_r * s_lut_r_s)
+    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+
+    # Get reverse sparsity index
+    rev_idx_spa_idx = (spa_bat * s_l_x_b_s +
+                       spa_row * s_l_x_r_s +
+                       spa_col * s_l_x_c_s)
+    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_x_b * s_l_x_b_s)
+    rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
+
+    if rev_idx_spa == -1:
+        tl.device_assert(False)
+        return
+
+    blk_x_idx = (pid_blk * x_b_s +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx < x_b * x_b_s)
+    blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+
+    blk_o_idx = (rev_idx_spa * o_b_s +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+    blk_o_msk = (blk_o_idx < o_b * o_b_s)
+    tl.atomic_add(o + blk_o_idx, blk_x, mask=blk_o_msk)
+
+
+def flow_forward(ctx, x: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
+                 sparsity_block_size: int, n_sparse_blocks: int, triton_block_size: int) -> Tensor:
+    output = torch.empty(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
+                         dtype=x.dtype, device=x.device)
+    output = torch.zeros_like(output)
+
+    x_b, x_r, x_c = x.size()
+    x_b_s, x_r_s, x_c_s = stride(x)
+    o_b, o_r, o_c = output.size()
+    o_b_s, o_r_s, o_c_s = stride(output)
+    s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_o.size()
+    s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_o)
+    s_lut_r, s_lut_c = sparsity_lut.size()
+    s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
+
+    if triton_block_size is None:
+        triton_block_size = get_triton_block_size(sparsity_block_size)
+
+    triton_grid = lambda meta: [o_b,
+                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+
+    (kernel_blocksparse_flow_pull[triton_grid]
+     (x,
+      x_b, x_b_s, x_r_s, x_c_s,
+      output,
+      o_b, o_b_s, o_r_s, o_c_s,
+      s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
+      sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+      sparsity_reverse_lut,
+      triton_block_size))
+
+    # Save for backward pass
+    ctx.sparsity_block_size = sparsity_block_size
+    ctx.triton_block_size = triton_block_size
+
+    return output
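The new flow module centralises the pull/push copy kernels and the shared flow_forward helper that previously lived inside repeat.py; partitioning (split/merge) and repeat now delegate their forward passes to it. The sketch below is an illustration, not code from the package (the op class is hypothetical), showing the delegation pattern an autograd Function follows once the output layout, LUT, and reverse LUT are prepared:

import torch
from blksprs.ops.flow import flow_forward

class _BlocksparseExampleOp(torch.autograd.Function):
    # Hypothetical op used only to illustrate the shared forward path.
    @staticmethod
    def forward(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut,
                sparsity_block_size, n_sparse_blocks, triton_block_size):
        ctx.save_for_backward(sparsity_layout_o)
        # flow_forward launches kernel_blocksparse_flow_pull and returns the
        # re-distributed block-sparse tensor in compressed form.
        return flow_forward(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut,
                            sparsity_block_size, n_sparse_blocks, triton_block_size)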
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/misc/row_wise.py

@@ -117,6 +117,10 @@ def kernel_blocksparse_row_wise_sum(x,
     rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)

+    if rev_idx_spa == -1:
+        tl.device_assert(False)
+        return
+
     blk_idx = ((pid_blk * x_b_s) +
                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])

@@ -240,6 +244,10 @@ def kernel_blocksparse_row_wise_max(x,
     rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)

+    if rev_idx_spa == -1:
+        tl.device_assert(False)
+        return
+
     blk_idx = ((pid_blk * x_b_s) +
                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/partitioning.py

@@ -1,7 +1,7 @@
 import torch
 from torch import Tensor

-from blksprs.ops.
+from blksprs.ops.flow import flow_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor

 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \

@@ -66,7 +66,7 @@ class _BlocksparseSplit(torch.autograd.Function):
         ctx.save_for_backward(sparsity_layout_o)
         ctx.num_partitions = num_partitions

-        return
+        return flow_forward(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
                             n_sparse_blocks, triton_block_size)

     @staticmethod

@@ -140,7 +140,7 @@ class _BlocksparseMerge(torch.autograd.Function):
         ctx.save_for_backward(sparsity_layout_o)
         ctx.num_partitions = num_partitions

-        return
+        return flow_forward(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
                             n_sparse_blocks, triton_block_size)

     @staticmethod
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/repeat.py

@@ -1,8 +1,8 @@
 import torch
 import triton
-from triton import language as tl
 from torch import Tensor

+from blksprs.ops.flow import kernel_blocksparse_flow_push, flow_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import get_triton_block_size, stride
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \

@@ -64,8 +64,9 @@ def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int,

     validate_contiguous(sparsity_layout_o, sparsity_lut, sparsity_reverse_lut)

-    return BlksprsTensor(
-
+    return BlksprsTensor(
+        _BlocksparseRepeat.apply(x, sparsity_layout_x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut,
+                                 sparsity_block_size, n_sparse_blocks, triton_block_size)), sparsity_layout_o


 def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,

@@ -122,8 +123,9 @@ def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,

     validate_contiguous(sparsity_layout_o, sparsity_lut, sparsity_reverse_lut)

-    return BlksprsTensor(
-
+    return BlksprsTensor(
+        _BlocksparseRepeat.apply(x, sparsity_layout_x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut,
+                                 sparsity_block_size, n_sparse_blocks, triton_block_size)), sparsity_layout_o


 class _BlocksparseRepeat(torch.autograd.Function):

@@ -137,7 +139,7 @@ class _BlocksparseRepeat(torch.autograd.Function):
         ctx.x_size = x.size()
         ctx.x_stride = stride(x)

-        return
+        return flow_forward(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
                             n_sparse_blocks, triton_block_size)

     @staticmethod

@@ -180,144 +182,3 @@ class _BlocksparseRepeat(torch.autograd.Function):
              triton_block_size))

         return output, None, None, None, None, None, None, None
-
-
-@triton.jit
-def kernel_blocksparse_flow_pull(x,
-                                 x_b, x_b_s, x_r_s, x_c_s,
-                                 o,
-                                 o_b, o_b_s, o_r_s, o_c_s,
-                                 s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
-                                 s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-                                 r_lut,
-                                 TRITON_BLOCK_SIZE: tl.constexpr) -> None:
-    # Get triton block indices
-    pid_blk = tl.program_id(axis=0)
-    pid_row = tl.program_id(axis=1)
-    pid_col = tl.program_id(axis=2)
-
-    # Get sparsity index of current output block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
-
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
-
-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
-
-    # Get reverse sparsity index
-    rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
-                       spa_row * s_l_o_r_s +
-                       spa_col * s_l_o_c_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
-    rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
-
-    if rev_idx_spa == -1:
-        tl.device_assert(False)
-        return
-
-    blk_x_idx = (rev_idx_spa * x_b_s +
-                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx < x_b * x_b_s)
-    blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
-
-    blk_o_idx = (pid_blk * o_b_s +
-                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx < o_b * o_b_s)
-    tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
-
-
-@triton.jit
-def kernel_blocksparse_flow_push(x,
-                                 x_b, x_b_s, x_r_s, x_c_s,
-                                 s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
-                                 s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-                                 r_lut,
-                                 o,
-                                 o_b, o_b_s, o_r_s, o_c_s,
-                                 TRITON_BLOCK_SIZE: tl.constexpr) -> None:
-    # Get triton block indices
-    pid_blk = tl.program_id(axis=0)
-    pid_row = tl.program_id(axis=1)
-    pid_col = tl.program_id(axis=2)
-
-    # Get sparsity index of current input block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
-
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
-
-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
-
-    # Get reverse sparsity index
-    rev_idx_spa_idx = (spa_bat * s_l_x_b_s +
-                       spa_row * s_l_x_r_s +
-                       spa_col * s_l_x_c_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_x_b * s_l_x_b_s)
-    rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
-
-    if rev_idx_spa == -1:
-        tl.device_assert(False)
-        return
-
-    blk_x_idx = (pid_blk * x_b_s +
-                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx < x_b * x_b_s)
-    blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
-
-    blk_o_idx = (rev_idx_spa * o_b_s +
-                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx < o_b * o_b_s)
-    tl.atomic_add(o + blk_o_idx, blk_x, mask=blk_o_msk)
-
-
-def forward_flow(ctx, x: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
-                 sparsity_block_size: int, n_sparse_blocks: int, triton_block_size: int) -> Tensor:
-    output = torch.empty(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
-                         dtype=x.dtype, device=x.device)
-    output = torch.zeros_like(output)
-
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_o.size()
-    s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_o)
-    s_lut_r, s_lut_c = sparsity_lut.size()
-    s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
-
-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-
-    triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-
-    (kernel_blocksparse_flow_pull[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
-      sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-      sparsity_reverse_lut,
-      triton_block_size))
-
-    # Save for backward pass
-    ctx.sparsity_block_size = sparsity_block_size
-    ctx.triton_block_size = triton_block_size
-
-    return output
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/ops/softmax.py

@@ -238,6 +238,10 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)

+        if rev_idx_spa_s == -1:
+            tl.device_assert(False)
+            return
+
         blk_s_idx = (rev_idx_spa_s * s_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
blksprs-1.9.1/blksprs/utils/layout_utils.py (new file)

@@ -0,0 +1,17 @@
+import math
+
+import torch
+import triton
+from torch import Tensor
+from torch.xpu import device
+from triton import language as tl
+
+from blksprs.utils.blksprs_tensor import BlksprsTensor
+from blksprs.utils.tools import get_triton_block_size, stride
+from blksprs.utils.validation import validate_triton_block_size, validate_dimensions, validate_device, \
+    validate_contiguous, validate_sparsity, validate_sparsity_block_size
+
+
+def build_full_sparsity_layout(x: Tensor, sparsity_block_size: int) -> Tensor:
+    return torch.ones(size=(x.size(0), x.size(1) // sparsity_block_size, x.size(2) // sparsity_block_size),
+                      dtype=torch.bool, device=x.device)
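build_full_sparsity_layout simply marks every block as present, which is convenient when a fully dense tensor has to be converted into compressed block-sparse form. A minimal usage sketch; shape, block size, and device are illustrative assumptions:

import torch
import blksprs as bs

x = torch.randn(2, 64, 64, device="cuda")
layout = bs.layouting.build_full_sparsity_layout(x, sparsity_block_size=32)
print(layout.shape, layout.dtype)   # torch.Size([2, 2, 2]) torch.bool
x_bs = bs.ops.to_sparse(x, layout, 32)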
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/utils/processing.py

@@ -1,7 +1,9 @@
+from collections.abc import Callable
+
 import torch
 from torch import Tensor, nn
-from triton.language import dtype

+import blksprs as bs
 from blksprs.layouting.sparsity_layout import build_sparsity_layout_matmul_fast
 from blksprs.ops.conversion import to_sparse
 from blksprs.ops.matmul import matmul

@@ -10,7 +12,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor


 def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
-                       linear: nn.Linear) -> (BlksprsTensor, Tensor):
+                       linear: nn.Linear, bias: nn.Parameter = None) -> (BlksprsTensor, Tensor):
     # Extract weight and bias
     w = linear.weight
     b = linear.bias

@@ -27,6 +29,8 @@ def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block
     interim = xw

     # Apply bias
+    if bias is not None:
+        b = bias
     if b is not None:
         b_slice = b.unsqueeze(0).unsqueeze(0).repeat(1, sparsity_block_size, 1)
         sparsity_layout_b_slice = torch.ones(size=(1, b_slice.size(1) // sparsity_block_size,

@@ -39,3 +43,32 @@ def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block
         interim = interim + b_bs

     return interim, sparsity_layout_xw
+
+
+def apply_torch_normalisation(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
+                              normalisation: nn.Module) -> BlksprsTensor:
+    return apply_function_applicable_row_wise(x, sparsity_layout, sparsity_block_size, normalisation)
+
+
+def apply_torch_dropout(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
+                        dropout: nn.Dropout) -> BlksprsTensor:
+    return apply_function_applicable_row_wise(x, sparsity_layout, sparsity_block_size, dropout)
+
+
+def apply_function_applicable_row_wise(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
+                                       function: Callable) -> BlksprsTensor:
+    sparsity_layout_packed = _pack_layout(sparsity_layout)
+    blksprs_pseudo_dense = bs.ops.to_dense(x, sparsity_layout_packed, sparsity_block_size)
+    normalisation_out = function(blksprs_pseudo_dense)
+    blksprs_sparse = bs.ops.to_sparse(normalisation_out, sparsity_layout_packed, sparsity_block_size)
+
+    return blksprs_sparse
+
+
+def _pack_layout(sparsity_layout: Tensor) -> BlksprsTensor:
+    sparsity_layout_resized = sparsity_layout.resize(1, sparsity_layout.size(0) * sparsity_layout.size(1),
+                                                     sparsity_layout.size(2))
+    non_zero_rows = torch.any(sparsity_layout_resized, dim=-1)
+    sparsity_layout_filtered = sparsity_layout_resized[non_zero_rows].unsqueeze(0)
+
+    return sparsity_layout_filtered
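The new processing helpers apply standard torch layers row-wise to a block-sparse tensor by densifying a packed layout, applying the layer, and re-compressing. A minimal sketch of their use; shapes, the LayerNorm size, and the CUDA device are illustrative assumptions:

import torch
from torch import nn
import blksprs as bs

sparsity_block_size = 32
x = torch.randn(2, 64, 64, device="cuda")
layout = bs.layouting.build_full_sparsity_layout(x, sparsity_block_size)
x_bs = bs.ops.to_sparse(x, layout, sparsity_block_size)

norm = nn.LayerNorm(64, device="cuda")
dropout = nn.Dropout(p=0.1)

x_bs = bs.utils.apply_torch_normalisation(x_bs, layout, sparsity_block_size, norm)
x_bs = bs.utils.apply_torch_dropout(x_bs, layout, sparsity_block_size, dropout)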
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs/utils/validation.py

@@ -36,7 +36,8 @@ def validate_dtype_int(*tensors: Tensor) -> None:
         return

     for tensor in tensors:
-        if tensor.dtype !=
+        if (tensor.dtype !=
+                torch.int32 and tensor.dtype != torch.int64):
             raise ValueError("Tensor must have int32 or int64 dtype")


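For reference, a small sketch of the dtype requirement this check enforces for index tensors passed to the distribution ops; values and shapes are illustrative:

import torch

idx_int64 = torch.randint(0, 64, (4, 32, 32), dtype=torch.int64)  # satisfies the check
idx_int32 = idx_int64.to(torch.int32)                             # satisfies the check
idx_float = idx_int64.to(torch.float32)                           # would raise
                                                                   # "Tensor must have int32 or int64 dtype"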
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.8.3
+Version: 1.9.1
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs

@@ -64,8 +64,12 @@ Further helpful operations (included in the ``bs.ops.misc`` module) that do **no
 - Row-wise sum, max, addition, and subtraction
 - Broadcast addition and subtraction between slices

-Furthermore, the library provides a set of utility functions
-
+Furthermore, the library provides a set of utility functions
+
+- for the creation of sparsity layouts based on existing
+dense tensors and for the scatter operation (module ``bs.layouting``),
+- for the application of ``nn.Linear``, ``nn.Dropout``, and ``nn.LayerNorm`` layers to block-sparse tensors,
+- as well as utility functions to apply linear layers,
 ensure correct input dimensionality, and validate input (module ``bs.utils``).

 ## Installation
{blksprs-1.8.3 → blksprs-1.9.1}/blksprs.egg-info/SOURCES.txt

@@ -10,6 +10,7 @@ blksprs/layouting/distribution_layout.py
 blksprs/layouting/sparsity_layout.py
 blksprs/ops/conversion.py
 blksprs/ops/distribution.py
+blksprs/ops/flow.py
 blksprs/ops/matmul.py
 blksprs/ops/partitioning.py
 blksprs/ops/repeat.py

@@ -21,6 +22,7 @@ blksprs/ops/misc/exp.py
 blksprs/ops/misc/row_wise.py
 blksprs/utils/benchmarking.py
 blksprs/utils/blksprs_tensor.py
+blksprs/utils/layout_utils.py
 blksprs/utils/processing.py
 blksprs/utils/tools.py
 blksprs/utils/validation.py