blksprs 1.8.3__tar.gz → 1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {blksprs-1.8.3 → blksprs-1.9}/PKG-INFO +1 -1
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/layouting/distribution_layout.py +23 -5
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/distribution.py +93 -35
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/experimental/distribution_mdi.py +8 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/misc/row_wise.py +8 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/softmax.py +4 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs.egg-info/PKG-INFO +1 -1
- {blksprs-1.8.3 → blksprs-1.9}/pyproject.toml +1 -1
- {blksprs-1.8.3 → blksprs-1.9}/README.md +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/__init__.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/layouting/sparsity_layout.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/conversion.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/matmul.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/misc/broadcast_ops.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/misc/exp.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/partitioning.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/repeat.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/transpose.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/utils/benchmarking.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/utils/blksprs_tensor.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/utils/processing.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/utils/tools.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs/utils/validation.py +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs.egg-info/SOURCES.txt +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs.egg-info/dependency_links.txt +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs.egg-info/requires.txt +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/blksprs.egg-info/top_level.txt +0 -0
- {blksprs-1.8.3 → blksprs-1.9}/setup.cfg +0 -0
{blksprs-1.8.3 → blksprs-1.9}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.8.3
+Version: 1.9
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
{blksprs-1.8.3 → blksprs-1.9}/blksprs/layouting/distribution_layout.py

@@ -10,13 +10,14 @@ from blksprs.utils.validation import validate_triton_block_size, validate_dimens
 
 
 def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: Tensor,
-                              size_target: torch.Size,
+                              dim: int, size_target: torch.Size,
                               sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
     """Builds the sparsity layout of either the source of a gather or the target of a scatter operation.
 
     Args:
         indices (BlksprsTensor): The block-sparse indices tensor in compressed form used for the gather or scatter operation.
         sparsity_layout_indices (Tensor): The sparsity layout of the indices block-sparse tensor.
+        dim (int): The dimension along which the operation is conducted.
         size_target (torch.Size): The size of the block-sparse target tensor in regular form.
         sparsity_block_size (int): The size of the sparsity blocks.
         triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
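Note: the hunk above changes the public signature of build_distribution_layout, inserting the new dim parameter before size_target. A minimal call sketch under stated assumptions (the input tensors and shapes are illustrative placeholders, not taken from the package):

    import torch
    from blksprs.layouting.distribution_layout import build_distribution_layout

    # Hypothetical setup: `indices` in compressed block form and its layout
    # would come from an existing block-sparse pipeline.
    indices = ...
    sparsity_layout_indices = ...

    layout = build_distribution_layout(indices, sparsity_layout_indices,
                                       2,                        # dim (new in 1.9)
                                       torch.Size((2, 64, 64)),  # size_target
                                       32)                       # sparsity_block_size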
@@ -31,6 +32,8 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
 
     sparsity_lut_i = torch.nonzero(sparsity_layout_indices).contiguous()
 
+    adjusted_dim = dim % 3
+
     output = torch.zeros(size_target[0], size_target[1] // sparsity_block_size, size_target[2] // sparsity_block_size,
                          dtype=torch.bool, device=indices.device)
 
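The `dim % 3` normalization added above maps negative dimension indices onto the three axes (batch, row, column) via Python's modulo semantics. A plain-Python check:

    for dim in (-3, -2, -1, 0, 1, 2):
        print(dim, "->", dim % 3)
    # -3 -> 0, -2 -> 1, -1 -> 2, 0 -> 0, 1 -> 1, 2 -> 2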
@@ -55,6 +58,7 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
         i_b, i_b_s, i_r_s, i_c_s,
         sparsity_lut_i,
         s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+        adjusted_dim,
         output,
         o_b, o_b_s, o_r_s, o_c_s,
         sparsity_block_size,
@@ -68,6 +72,7 @@ def kernel_distribution_layout(i,
                                i_b, i_b_s, i_r_s, i_c_s,
                                s_lut_i,
                                s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+                               dim,
                                o,
                                o_b, o_b_s, o_r_s, o_c_s,
                                sparsity_block_size,
@@ -86,17 +91,30 @@ def kernel_distribution_layout(i,
     spa_row_i_msk = (spa_row_i_idx < s_lut_i_r * s_lut_i_r_s)
     spa_row_i = tl.load(s_lut_i + spa_row_i_idx, mask=spa_row_i_msk)
 
+    spa_col_i_idx = (pid_blk * s_lut_i_r_s + 2 * s_lut_i_c_s)
+    spa_col_i_msk = (spa_col_i_idx < s_lut_i_r * s_lut_i_r_s)
+    spa_col_i = tl.load(s_lut_i + spa_col_i_idx, mask=spa_col_i_msk)
+
     blk_i_idx = (pid_blk * i_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
     blk_i_msk = (blk_i_idx < i_b * i_b_s)
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk)
 
-
+    dst_bat_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_i, dtype=tl.int32)
+    dst_row_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_i, dtype=tl.int32)
+    dst_col_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_i, dtype=tl.int32)
+    if dim == 0:
+        dst_bat_idx = blk_i
+    elif dim == 1:
+        dst_row_idx = blk_i // sparsity_block_size
+    elif dim == 2:
+        dst_col_idx = blk_i // sparsity_block_size
+
     blk_v = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), 1, dtype=tl.int32)
 
-    blk_o_idx = ((
-                 (
-                 (
+    blk_o_idx = ((dst_bat_idx * o_b_s) +
+                 (dst_row_idx * o_r_s) +
+                 (dst_col_idx * o_c_s))
     blk_o_msk = (blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, blk_v, mask=blk_o_msk)
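The per-dim branch added above routes each loaded index value either to a batch coordinate (dim 0) or to a sparsity-block coordinate obtained by integer division (dims 1 and 2); the gather/scatter kernels further below also use the modulo as the offset inside the block. A plain-Python illustration of that split:

    sparsity_block_size = 32
    blk_i = 70                           # index value along the chosen dimension
    print(blk_i // sparsity_block_size)  # 2 -> which sparsity block it falls into
    print(blk_i % sparsity_block_size)   # 6 -> offset inside that block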
{blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/distribution.py

@@ -3,19 +3,23 @@ import triton
 from torch import Tensor
 from triton import language as tl
 
+from blksprs.ops.conversion import to_dense
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import get_triton_block_size, stride
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_dtype_int, validate_sparsity_block_size, validate_triton_block_size
 
 
-def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
+def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
+           dim: int,
+           idx: BlksprsTensor, sparsity_layout_idx: Tensor,
            sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
     """Applies a gather operation on a block-sparse tensor in compressed form.
 
     Args:
         src (BlksprsTensor): The source block-sparse tensor in compressed form to gather from.
         sparsity_layout_src (Tensor): The sparsity layout of the source block-sparse tensor.
+        dim (int): The dimension along which to gather.
         idx (BlksprsTensor): The block-sparse indices tensor in compressed form specifying how to gather from the source tensor.
         sparsity_layout_idx (Tensor): The sparsity layout of the indices block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.
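Note: 1.9 moves `dim` into the positional argument list of `gather`, between the source layout and the indices. A hypothetical call sketch (variable setup assumed, not shown in the diff; the old call order is inferred from the removed signature line and the next hunk's context):

    # blksprs 1.8.3:
    # out = gather(src, sparsity_layout_src, idx, sparsity_layout_idx, sparsity_block_size)

    # blksprs 1.9:
    out = gather(src, sparsity_layout_src,
                 2,                       # dim: dimension along which to gather
                 idx, sparsity_layout_idx,
                 sparsity_block_size)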
@@ -46,16 +50,18 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor, idx: BlksprsTensor,
     validate_contiguous(sparsity_layout_src, sparsity_reverse_lut_x,
                         sparsity_layout_idx, sparsity_lut_i)
 
+    adjusted_dim = dim % 3
+
     return BlksprsTensor(_BlocksparseGather.apply(src, sparsity_layout_src, sparsity_reverse_lut_x,
-
-
+                                                  adjusted_dim, idx, sparsity_layout_idx, sparsity_lut_i,
+                                                  sparsity_block_size, triton_block_size))
 
 
 class _BlocksparseGather(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_x: Tensor, sparsity_reverse_lut_x: Tensor,
-                i: Tensor, sparsity_layout_i: Tensor, sparsity_lut_i: Tensor,
+                dim: int, i: Tensor, sparsity_layout_i: Tensor, sparsity_lut_i: Tensor,
                 sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
         output = torch.empty_like(i, dtype=x.dtype)
 
@@ -82,6 +88,7 @@ class _BlocksparseGather(torch.autograd.Function):
             x_b, x_b_s, x_r_s, x_c_s,
             s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
             sparsity_reverse_lut_x,
+            dim,
             i,
             i_b, i_b_s, i_r_s, i_c_s,
             output,
@@ -91,6 +98,7 @@ class _BlocksparseGather(torch.autograd.Function):
             triton_block_size))
 
         ctx.save_for_backward(sparsity_layout_x, i, sparsity_layout_i)
+        ctx.dim = dim
         ctx.sparsity_block_size = sparsity_block_size
         ctx.triton_block_size = triton_block_size
 
@@ -99,15 +107,15 @@ class _BlocksparseGather(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
         sparsity_layout_x, i, sparsity_layout_i = ctx.saved_tensors
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         triton_block_size = ctx.triton_block_size
 
         return scatter_reduce(grad_output, sparsity_layout_i,
-                              i,
-                              sparsity_layout_x,
-                              sparsity_block_size,
+                              dim, i,
+                              sparsity_layout_x, sparsity_block_size,
                               reduce_op="sum",
-                              triton_block_size=triton_block_size), None, None, None, None, None, None, None
+                              triton_block_size=triton_block_size), None, None, None, None, None, None, None, None
 
     @staticmethod
     @triton.jit
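The extra trailing `None` in `backward` above mirrors the new `dim` argument: `torch.autograd.Function.backward` must return one gradient slot per `forward` input, with `None` for non-differentiable arguments. A minimal standalone illustration of that rule (generic example, not blksprs code):

    import torch

    class Scale(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, factor):    # two inputs ...
            ctx.factor = factor
            return x * factor

        @staticmethod
        def backward(ctx, grad_output):
            # ... so exactly two return slots, None for the scalar `factor`
            return grad_output * ctx.factor, None

    x = torch.ones(3, requires_grad=True)
    Scale.apply(x, 2.0).sum().backward()
    print(x.grad)  # tensor([2., 2., 2.])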
@@ -115,6 +123,7 @@ class _BlocksparseGather(torch.autograd.Function):
                                   x_b, x_b_s, x_r_s, x_c_s,
                                   s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
                                   r_lut_x,
+                                  dim,
                                   i,
                                   i_b, i_b_s, i_r_s, i_c_s,
                                   o,
@@ -136,6 +145,10 @@ class _BlocksparseGather(torch.autograd.Function):
        spa_row_o_msk = (spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
        spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
 
+       spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
+       spa_col_o_msk = (spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
+       spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+
        # Load index values
        blk_i_idx = ((pid_blk * i_b_s) +
                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
@@ -143,33 +156,50 @@ class _BlocksparseGather(torch.autograd.Function):
        blk_i_msk = (blk_i_idx < i_b * i_b_s)
        blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
 
-       # Get
+       # Get indices of sparsity blocks and positions within the blocks
        pos_spa_blk_x = blk_i // sparsity_block_size
-
+       pos_spa_int_x = blk_i % sparsity_block_size
+
+       rev_dst_bat_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_o, dtype=tl.int32)
+       rev_dst_row_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_o, dtype=tl.int32)
+       rev_dst_col_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_o, dtype=tl.int32)
+       dst_row_x = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+                    .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+       dst_col_x = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+                    .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+       if dim == 0:
+           rev_dst_bat_x = blk_i
+       elif dim == 1:
+           rev_dst_row_x = pos_spa_blk_x
+           dst_row_x = pos_spa_int_x * x_r_s
+       elif dim == 2:
+           rev_dst_col_x = pos_spa_blk_x
+           dst_col_x = pos_spa_int_x * x_c_s
 
        # Load reverse sparsity indices for x
-       rev_idx_spa_x_idx = ((
-                            (
-                            (
+       rev_idx_spa_x_idx = ((rev_dst_bat_x * s_l_x_b_s) +
+                            (rev_dst_row_x * s_l_x_r_s) +
+                            (rev_dst_col_x * s_l_x_c_s))
        rev_idx_spa_x_msk = (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
        rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
 
        # Load x values
        blk_x_idx = ((rev_idx_spa_x * x_b_s) +
-
-
-       blk_x_msk = (blk_x_idx < x_b * x_b_s)
+                    dst_row_x +
+                    dst_col_x)
+       blk_x_msk = ((blk_x_idx < x_b * x_b_s) & rev_idx_spa_x_msk != -1)
        blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
        # Store output
        blk_o_idx = ((pid_blk * o_b_s) +
                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-       blk_o_msk = (blk_o_idx < o_b * o_b_s)
+       blk_o_msk = ((blk_o_idx < o_b * o_b_s) & rev_idx_spa_x_msk != -1)
        tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
 
 
 def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
+            dim: int,
             idx: BlksprsTensor,
             sparsity_layout_tgt: Tensor,
             sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
@@ -184,6 +214,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
 
 
 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
+                   dim: int,
                    idx: BlksprsTensor,
                    sparsity_layout_tgt: Tensor,
                    sparsity_block_size: int,
@@ -193,6 +224,7 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
     Args:
         src (BlksprsTensor): The source block-sparse tensor in compressed form to scatter from.
         sparsity_layout_src (Tensor): The sparsity layout of the source block-sparse tensor.
+        dim (int): The dimension along which to scatter.
         idx (BlksprsTensor): The block-sparse indices tensor in compressed form specifying how to scatter to the target tensor.
         sparsity_layout_tgt (Tensor): The sparsity layout of the target block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.
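As with `gather`, `scatter_reduce` now takes `dim` right after the source layout. A hypothetical call sketch (variables assumed), consistent with the call visible in `_BlocksparseGather.backward` above:

    out = scatter_reduce(src, sparsity_layout_src,
                         1,                       # dim: dimension along which to scatter
                         idx, sparsity_layout_tgt,
                         sparsity_block_size,
                         reduce_op="sum")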
@@ -230,18 +262,20 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
     validate_contiguous(sparsity_layout_src, sparsity_lut_x,
                         sparsity_layout_tgt, sparsity_reverse_lut_o)
 
+    adjusted_dim = dim % 3
+
     return BlksprsTensor(_BlocksparseScatterReduce.apply(src, sparsity_layout_src, sparsity_lut_x,
-
-
-
-
+                                                         adjusted_dim, idx,
+                                                         sparsity_layout_tgt, sparsity_reverse_lut_o,
+                                                         sparsity_block_size, n_sparse_blocks,
+                                                         reduce_op, triton_block_size))
 
 
 class _BlocksparseScatterReduce(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_x: Tensor, sparsity_lut_x: Tensor,
-                i: Tensor,
+                dim: int, i: Tensor,
                 sparsity_layout_o: Tensor, sparsity_reverse_lut_o: Tensor,
                 sparsity_block_size: int, n_sparse_blocks: int,
                 reduce_op: str, triton_block_size: int) -> Tensor:
@@ -274,10 +308,11 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
            (x,
             x_b, x_b_s, x_r_s, x_c_s,
             sparsity_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+            dim,
             i,
             i_b, i_b_s, i_r_s, i_c_s,
             output,
-            o_b, o_b_s,
+            o_b, o_b_s,
             s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
             sparsity_reverse_lut_o,
             reduce_op_ind,
@@ -285,6 +320,7 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
             triton_block_size))
 
         ctx.save_for_backward(sparsity_layout_x, i, sparsity_layout_o)
+        ctx.dim = dim
         ctx.sparsity_block_size = sparsity_block_size
         ctx.reduce_op = reduce_op
         ctx.triton_block_size = triton_block_size
@@ -294,13 +330,14 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
         sparsity_layout_x, i, sparsity_layout_o = ctx.saved_tensors
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         reduce_op = ctx.reduce_op
         triton_block_size = ctx.triton_block_size
 
         if reduce_op == "sum":
-            return gather(grad_output, sparsity_layout_o, i, sparsity_layout_x, sparsity_block_size,
-                          triton_block_size=triton_block_size), None, None, None, None, None, None, None, None, None
+            return gather(grad_output, sparsity_layout_o, dim, i, sparsity_layout_x, sparsity_block_size,
+                          triton_block_size=triton_block_size), None, None, None, None, None, None, None, None, None, None
         else:
             raise ValueError(f"Reduction operation '{reduce_op}' does not support backward pass")
 
@@ -309,10 +346,11 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
     def kernel_blocksparse_scatter(x,
                                    x_b, x_b_s, x_r_s, x_c_s,
                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                                   dim,
                                    i,
                                    i_b, i_b_s, i_r_s, i_c_s,
                                    o,
-                                   o_b, o_b_s,
+                                   o_b, o_b_s,
                                    s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
                                    r_lut_o,
                                    reduce_op_ind,
@@ -332,6 +370,10 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
        spa_row_x_msk = (spa_row_x_idx < s_lut_x_r * s_lut_x_r_s)
        spa_row_x = tl.load(s_lut_x + spa_row_x_idx, mask=spa_row_x_msk)
 
+       spa_col_x_idx = (pid_blk * s_lut_x_r_s + 2 * s_lut_x_c_s)
+       spa_col_x_msk = (spa_col_x_idx < s_lut_x_r * s_lut_x_r_s)
+       spa_col_x = tl.load(s_lut_x + spa_col_x_idx, mask=spa_col_x_msk)
+
        # Load x values
        blk_x_idx = ((pid_blk * x_b_s) +
                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
@@ -346,22 +388,38 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
        blk_i_msk = (blk_i_idx < i_b * i_b_s)
        blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
 
-       # Get
-
-
+       # Get indices of sparsity blocks and positions within the blocks
+       pos_spa_blk_x = blk_i // sparsity_block_size
+       pos_spa_int_x = blk_i % sparsity_block_size
+
+       rev_dst_bat_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_x, dtype=tl.int32)
+       rev_dst_row_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_x, dtype=tl.int32)
+       rev_dst_col_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_x, dtype=tl.int32)
+       dst_row_o = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+                    .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+       dst_col_o = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+                    .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+       if dim == 0:
+           rev_dst_bat_o = blk_i
+       elif dim == 1:
+           rev_dst_row_o = pos_spa_blk_x
+           dst_row_o = pos_spa_int_x * x_r_s
+       elif dim == 2:
+           rev_dst_col_o = pos_spa_blk_x
+           dst_col_o = pos_spa_int_x * x_c_s
 
        # Load reverse sparsity indices for o
-       rev_idx_spa_o_idx = ((
-                            (
-                            (
+       rev_idx_spa_o_idx = ((rev_dst_bat_o * s_l_o_b_s) +
+                            (rev_dst_row_o * s_l_o_r_s) +
+                            (rev_dst_col_o * s_l_o_c_s))
        rev_idx_spa_o_msk = (rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
        rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)
 
        # Store output
        blk_o_idx = ((rev_idx_spa_o * o_b_s) +
-
-
-       blk_o_msk = (blk_o_idx < o_b * o_b_s)
+                    dst_row_o +
+                    dst_col_o)
+       blk_o_msk = ((blk_o_idx < o_b * o_b_s) & rev_idx_spa_o_msk != -1)
 
        if reduce_op_ind == 0:
            tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
{blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/experimental/distribution_mdi.py

@@ -153,6 +153,10 @@ class _BlocksparseGatherMDI(torch.autograd.Function):
        rev_idx_spa_x_msk = (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
        rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
 
+       if rev_idx_spa_x == -1:
+           tl.device_assert(False)
+           return
+
        # Load x values
        blk_x_idx = ((rev_idx_spa_x * x_b_s) +
                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
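The same `-1` guard recurs in the remaining hunks below (scatter-reduce MDI, row-wise sum/max, softmax): a reverse sparsity LUT yields `-1` when the looked-up block is absent, and the kernels now trap on that instead of computing with a bogus offset. A self-contained miniature of the pattern (hypothetical kernel, not from the package):

    import triton
    import triton.language as tl

    @triton.jit
    def guarded_kernel(r_lut, x, BLOCK: tl.constexpr):
        pid = tl.program_id(axis=0)
        # Reverse LUT: logical block -> physical slot, -1 means "not present".
        rev_idx = tl.load(r_lut + pid).to(tl.int32)
        if rev_idx == -1:
            tl.device_assert(False)  # fail loudly when device assertions are enabled
            return                   # otherwise skip this block entirely
        offs = rev_idx * BLOCK + tl.arange(0, BLOCK)
        vals = tl.load(x + offs)
        tl.store(x + offs, vals + 1)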
@@ -342,6 +346,10 @@ class _BlocksparseScatterReduceMDI(torch.autograd.Function):
        rev_idx_spa_o_msk = (rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
        rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)
 
+       if rev_idx_spa_o == -1:
+           tl.device_assert(False)
+           return
+
        # Store output
        blk_o_idx = ((rev_idx_spa_o * o_b_s) +
                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
{blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/misc/row_wise.py

@@ -117,6 +117,10 @@ def kernel_blocksparse_row_wise_sum(x,
    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
    rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
+   if rev_idx_spa == -1:
+       tl.device_assert(False)
+       return
+
    blk_idx = ((pid_blk * x_b_s) +
               ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
               ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
@@ -240,6 +244,10 @@ def kernel_blocksparse_row_wise_max(x,
    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
    rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
+   if rev_idx_spa == -1:
+       tl.device_assert(False)
+       return
+
    blk_idx = ((pid_blk * x_b_s) +
               ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
               ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
{blksprs-1.8.3 → blksprs-1.9}/blksprs/ops/softmax.py

@@ -238,6 +238,10 @@ class _BlocksparseSoftmax(torch.autograd.Function):
        rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
        rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
 
+       if rev_idx_spa_s == -1:
+           tl.device_assert(False)
+           return
+
        blk_s_idx = (rev_idx_spa_s * s_b_s +
                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                     (tl.arange(0, 1) * s_c_s)[None, :])
{blksprs-1.8.3 → blksprs-1.9}/blksprs.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.8.3
+Version: 1.9
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
All remaining files are unchanged (the entries marked +0 -0 in the file list above).