blksprs 1.8.3__py3-none-any.whl → 1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
blksprs/layouting/distribution_layout.py CHANGED
@@ -10,13 +10,14 @@ from blksprs.utils.validation import validate_triton_block_size, validate_dimens
 
 
 def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: Tensor,
-                              size_target: torch.Size,
+                              dim: int, size_target: torch.Size,
                               sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
     """Builds the sparsity layout of either the source of a gather or the target of a scatter operation.
 
     Args:
         indices (BlksprsTensor): The block-sparse indices tensor in compressed form used for the gather or scatter operation.
         sparsity_layout_indices (Tensor): The sparsity layout of the indices block-sparse tensor.
+        dim (int): The dimension along which the operation is conducted.
         size_target (torch.Size): The size of the block-sparse target tensor in regular form.
         sparsity_block_size (int): The size of the sparsity blocks.
         triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
@@ -31,6 +32,8 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
 
     sparsity_lut_i = torch.nonzero(sparsity_layout_indices).contiguous()
 
+    adjusted_dim = dim % 3
+
     output = torch.zeros(size_target[0], size_target[1] // sparsity_block_size, size_target[2] // sparsity_block_size,
                          dtype=torch.bool, device=indices.device)
 
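The `dim % 3` normalization resolves negative dimension arguments the same way PyTorch does for three-dimensional (batch, row, column) tensors. A minimal check of the mapping:

```python
# dim % 3 maps negative dims onto (batch, row, col), as torch.gather does for 3-D tensors
for dim in (0, 1, 2, -1, -2, -3):
    print(dim, "->", dim % 3)  # 0->0, 1->1, 2->2, -1->2, -2->1, -3->0
```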
@@ -55,6 +58,7 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
          i_b, i_b_s, i_r_s, i_c_s,
          sparsity_lut_i,
          s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+         adjusted_dim,
          output,
          o_b, o_b_s, o_r_s, o_c_s,
          sparsity_block_size,
@@ -68,6 +72,7 @@ def kernel_distribution_layout(i,
                                i_b, i_b_s, i_r_s, i_c_s,
                                s_lut_i,
                                s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+                               dim,
                                o,
                                o_b, o_b_s, o_r_s, o_c_s,
                                sparsity_block_size,
@@ -86,17 +91,30 @@ def kernel_distribution_layout(i,
     spa_row_i_msk = (spa_row_i_idx < s_lut_i_r * s_lut_i_r_s)
     spa_row_i = tl.load(s_lut_i + spa_row_i_idx, mask=spa_row_i_msk)
 
+    spa_col_i_idx = (pid_blk * s_lut_i_r_s + 2 * s_lut_i_c_s)
+    spa_col_i_msk = (spa_col_i_idx < s_lut_i_r * s_lut_i_r_s)
+    spa_col_i = tl.load(s_lut_i + spa_col_i_idx, mask=spa_col_i_msk)
+
     blk_i_idx = (pid_blk * i_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
     blk_i_msk = (blk_i_idx < i_b * i_b_s)
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk)
 
-    blk_i = blk_i // sparsity_block_size
+    dst_bat_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_i, dtype=tl.int32)
+    dst_row_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_i, dtype=tl.int32)
+    dst_col_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_i, dtype=tl.int32)
+    if dim == 0:
+        dst_bat_idx = blk_i
+    elif dim == 1:
+        dst_row_idx = blk_i // sparsity_block_size
+    elif dim == 2:
+        dst_col_idx = blk_i // sparsity_block_size
+
     blk_v = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), 1, dtype=tl.int32)
 
-    blk_o_idx = ((spa_bat_i * o_b_s) +
-                 (spa_row_i * o_r_s) +
-                 (blk_i * o_c_s))
+    blk_o_idx = ((dst_bat_idx * o_b_s) +
+                 (dst_row_idx * o_r_s) +
+                 (dst_col_idx * o_c_s))
     blk_o_msk = (blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, blk_v, mask=blk_o_msk)
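The effect of the new `dim` dispatch is easiest to see on a dense tensor: each index value marks one destination sparsity block, with `dim` deciding whether the value selects the batch, the row block, or the column block; the other two coordinates come from the position of the index element itself. A dense sketch of that logic (my own illustration under those assumptions, not the library's code):

```python
import torch

def distribution_layout_dense(idx, dim, size_target, sparsity_block_size):
    # Mark which sparsity blocks of the target a gather/scatter along `dim` touches.
    b, r, c = size_target
    layout = torch.zeros(b, r // sparsity_block_size, c // sparsity_block_size, dtype=torch.bool)
    bat = torch.arange(idx.size(0)).view(-1, 1, 1).expand_as(idx)
    row = torch.arange(idx.size(1)).view(1, -1, 1).expand_as(idx) // sparsity_block_size
    col = torch.arange(idx.size(2)).view(1, 1, -1).expand_as(idx) // sparsity_block_size
    if dim == 0:
        bat = idx                         # index values are batch indices
    elif dim == 1:
        row = idx // sparsity_block_size  # index values select row blocks
    else:
        col = idx // sparsity_block_size  # index values select column blocks
    layout[bat.flatten(), row.flatten(), col.flatten()] = True
    return layout

idx = torch.randint(0, 8, (2, 8, 8))
print(distribution_layout_dense(idx, 2, (2, 8, 8), 4).shape)  # torch.Size([2, 2, 2])
```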
blksprs/ops/distribution.py CHANGED
@@ -3,19 +3,23 @@ import triton
 from torch import Tensor
 from triton import language as tl
 
+from blksprs.ops.conversion import to_dense
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import get_triton_block_size, stride
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_dtype_int, validate_sparsity_block_size, validate_triton_block_size
 
 
-def gather(src: BlksprsTensor, sparsity_layout_src: Tensor, idx: BlksprsTensor, sparsity_layout_idx: Tensor,
+def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
+           dim: int,
+           idx: BlksprsTensor, sparsity_layout_idx: Tensor,
            sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
     """Applies a gather operation on a block-sparse tensor in compressed form.
 
     Args:
         src (BlksprsTensor): The source block-sparse tensor in compressed form to gather from.
         sparsity_layout_src (Tensor): The sparsity layout of the source block-sparse tensor.
+        dim (int): The dimension along which to gather.
         idx (BlksprsTensor): The block-sparse indices tensor in compressed form specifying how to gather from the source tensor.
         sparsity_layout_idx (Tensor): The sparsity layout of the indices block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.
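With the new signature, `dim` sits directly after the source tensor and its layout, mirroring `torch.gather(input, dim, index)`. A dense analogue for intuition (plain PyTorch; the commented blksprs call is a sketch of the 1.9 API with hypothetical variable names):

```python
import torch

src = torch.arange(2 * 4 * 4, dtype=torch.float32).reshape(2, 4, 4)
idx = torch.randint(0, 4, (2, 4, 4))
out = torch.gather(src, 2, idx)  # gather along columns
# blksprs 1.9 (sketch): gather(src_bs, layout_src, 2, idx_bs, layout_idx, sparsity_block_size)
```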
@@ -46,16 +50,18 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor, idx: BlksprsTensor,
     validate_contiguous(sparsity_layout_src, sparsity_reverse_lut_x,
                         sparsity_layout_idx, sparsity_lut_i)
 
+    adjusted_dim = dim % 3
+
     return BlksprsTensor(_BlocksparseGather.apply(src, sparsity_layout_src, sparsity_reverse_lut_x,
-                                                  idx, sparsity_layout_idx, sparsity_lut_i,
-                                                  sparsity_block_size, triton_block_size))
+                                                  adjusted_dim, idx, sparsity_layout_idx, sparsity_lut_i,
+                                                  sparsity_block_size, triton_block_size))
 
 
 class _BlocksparseGather(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_x: Tensor, sparsity_reverse_lut_x: Tensor,
-                i: Tensor, sparsity_layout_i: Tensor, sparsity_lut_i: Tensor,
+                dim: int, i: Tensor, sparsity_layout_i: Tensor, sparsity_lut_i: Tensor,
                 sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
         output = torch.empty_like(i, dtype=x.dtype)
 
@@ -82,6 +88,7 @@ class _BlocksparseGather(torch.autograd.Function):
              x_b, x_b_s, x_r_s, x_c_s,
              s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
              sparsity_reverse_lut_x,
+             dim,
              i,
              i_b, i_b_s, i_r_s, i_c_s,
              output,
@@ -91,6 +98,7 @@ class _BlocksparseGather(torch.autograd.Function):
              triton_block_size))
 
         ctx.save_for_backward(sparsity_layout_x, i, sparsity_layout_i)
+        ctx.dim = dim
         ctx.sparsity_block_size = sparsity_block_size
         ctx.triton_block_size = triton_block_size
 
@@ -99,15 +107,15 @@ class _BlocksparseGather(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
         sparsity_layout_x, i, sparsity_layout_i = ctx.saved_tensors
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         triton_block_size = ctx.triton_block_size
 
         return scatter_reduce(grad_output, sparsity_layout_i,
-                              i,
-                              sparsity_layout_x,
-                              sparsity_block_size,
+                              dim, i,
+                              sparsity_layout_x, sparsity_block_size,
                               reduce_op="sum",
-                              triton_block_size=triton_block_size), None, None, None, None, None, None, None
+                              triton_block_size=triton_block_size), None, None, None, None, None, None, None, None
 
     @staticmethod
     @triton.jit
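The backward pass works because gather and sum-scatter through the same indices are adjoint along the same dimension. A quick dense check of that identity (plain PyTorch, my own illustration):

```python
import torch

src = torch.randn(2, 4, 4, requires_grad=True)
idx = torch.randint(0, 4, (2, 4, 4))
torch.gather(src, 2, idx).sum().backward()

# The gradient equals scattering ones back through the same indices.
manual = torch.zeros(2, 4, 4).scatter_add(2, idx, torch.ones(2, 4, 4))
assert torch.allclose(src.grad, manual)
```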
@@ -115,6 +123,7 @@ class _BlocksparseGather(torch.autograd.Function):
                                   x_b, x_b_s, x_r_s, x_c_s,
                                   s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
                                   r_lut_x,
+                                  dim,
                                   i,
                                   i_b, i_b_s, i_r_s, i_c_s,
                                   o,
@@ -136,6 +145,10 @@ class _BlocksparseGather(torch.autograd.Function):
         spa_row_o_msk = (spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
         spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
 
+        spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
+        spa_col_o_msk = (spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
+        spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+
         # Load index values
         blk_i_idx = ((pid_blk * i_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
@@ -143,33 +156,50 @@ class _BlocksparseGather(torch.autograd.Function):
         blk_i_msk = (blk_i_idx < i_b * i_b_s)
         blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
 
-        # Get positions of sparsity blocks
+        # Get indices of sparsity blocks and positions within the blocks
         pos_spa_blk_x = blk_i // sparsity_block_size
-        pos_spa_col_x = blk_i % sparsity_block_size
+        pos_spa_int_x = blk_i % sparsity_block_size
+
+        rev_dst_bat_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_o, dtype=tl.int32)
+        rev_dst_row_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_o, dtype=tl.int32)
+        rev_dst_col_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_o, dtype=tl.int32)
+        dst_row_x = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        dst_col_x = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        if dim == 0:
+            rev_dst_bat_x = blk_i
+        elif dim == 1:
+            rev_dst_row_x = pos_spa_blk_x
+            dst_row_x = pos_spa_int_x * x_r_s
+        elif dim == 2:
+            rev_dst_col_x = pos_spa_blk_x
+            dst_col_x = pos_spa_int_x * x_c_s
 
         # Load reverse sparsity indices for x
-        rev_idx_spa_x_idx = ((spa_bat_o * s_l_x_b_s) +
-                             (spa_row_o * s_l_x_r_s) +
-                             (pos_spa_blk_x * s_l_x_c_s))
+        rev_idx_spa_x_idx = ((rev_dst_bat_x * s_l_x_b_s) +
+                             (rev_dst_row_x * s_l_x_r_s) +
+                             (rev_dst_col_x * s_l_x_c_s))
         rev_idx_spa_x_msk = (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
         rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
 
         # Load x values
         blk_x_idx = ((rev_idx_spa_x * x_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                     (pos_spa_col_x * x_c_s))
-        blk_x_msk = (blk_x_idx < x_b * x_b_s)
+                     dst_row_x +
+                     dst_col_x)
+        blk_x_msk = ((blk_x_idx < x_b * x_b_s) & rev_idx_spa_x_msk != -1)
        blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         # Store output
         blk_o_idx = ((pid_blk * o_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+        blk_o_msk = ((blk_o_idx < o_b * o_b_s) & rev_idx_spa_x_msk != -1)
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
 
 
 def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
+            dim: int,
             idx: BlksprsTensor,
             sparsity_layout_tgt: Tensor,
             sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
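The `!= -1` terms in the rewritten masks lean on the reverse-LUT convention used throughout the library: present blocks map to their linear block index, absent blocks map to -1, so masked loads and stores skip blocks that do not exist in the sparse layout. A minimal sketch of building such a LUT (my own illustration, not the library's code):

```python
import torch

sparsity_layout = torch.tensor([[1, 0], [0, 1]], dtype=torch.bool)
rev_lut = torch.full(sparsity_layout.shape, -1, dtype=torch.long)
rev_lut[sparsity_layout] = torch.arange(int(sparsity_layout.sum()))
print(rev_lut)  # tensor([[ 0, -1], [-1,  1]])
```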
@@ -184,6 +214,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
 
 
 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
+                   dim: int,
                    idx: BlksprsTensor,
                    sparsity_layout_tgt: Tensor,
                    sparsity_block_size: int,
@@ -193,6 +224,7 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
     Args:
         src (BlksprsTensor): The source block-sparse tensor in compressed form to scatter from.
         sparsity_layout_src (Tensor): The sparsity layout of the source block-sparse tensor.
+        dim (int): The dimension along which to scatter.
         idx (BlksprsTensor): The block-sparse indices tensor in compressed form specifying how to scatter to the target tensor.
         sparsity_layout_tgt (Tensor): The sparsity layout of the target block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.
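The dense analogue of the sum reduction is torch's own scatter-add (sketch with hypothetical variable names, not the compressed-form call):

```python
import torch

src = torch.randn(2, 4, 4)
idx = torch.randint(0, 4, (2, 4, 4))
tgt = torch.zeros(2, 4, 4).scatter_add(1, idx, src)  # scatter along rows
# blksprs 1.9 (sketch): scatter_reduce(src_bs, layout_src, 1, idx_bs, layout_tgt,
#                                      sparsity_block_size, reduce_op="sum")
```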
@@ -230,18 +262,20 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
     validate_contiguous(sparsity_layout_src, sparsity_lut_x,
                         sparsity_layout_tgt, sparsity_reverse_lut_o)
 
+    adjusted_dim = dim % 3
+
     return BlksprsTensor(_BlocksparseScatterReduce.apply(src, sparsity_layout_src, sparsity_lut_x,
-                                                         idx,
-                                                         sparsity_layout_tgt, sparsity_reverse_lut_o,
-                                                         sparsity_block_size, n_sparse_blocks,
-                                                         reduce_op, triton_block_size))
+                                                         adjusted_dim, idx,
+                                                         sparsity_layout_tgt, sparsity_reverse_lut_o,
+                                                         sparsity_block_size, n_sparse_blocks,
+                                                         reduce_op, triton_block_size))
 
 
 class _BlocksparseScatterReduce(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_x: Tensor, sparsity_lut_x: Tensor,
-                i: Tensor,
+                dim: int, i: Tensor,
                 sparsity_layout_o: Tensor, sparsity_reverse_lut_o: Tensor,
                 sparsity_block_size: int, n_sparse_blocks: int,
                 reduce_op: str, triton_block_size: int) -> Tensor:
@@ -274,10 +308,11 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
             (x,
              x_b, x_b_s, x_r_s, x_c_s,
              sparsity_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+             dim,
              i,
              i_b, i_b_s, i_r_s, i_c_s,
              output,
-             o_b, o_b_s, o_r_s, o_c_s,
+             o_b, o_b_s,
              s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
              sparsity_reverse_lut_o,
              reduce_op_ind,
@@ -285,6 +320,7 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
              triton_block_size))
 
         ctx.save_for_backward(sparsity_layout_x, i, sparsity_layout_o)
+        ctx.dim = dim
         ctx.sparsity_block_size = sparsity_block_size
         ctx.reduce_op = reduce_op
         ctx.triton_block_size = triton_block_size
@@ -294,13 +330,14 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
         sparsity_layout_x, i, sparsity_layout_o = ctx.saved_tensors
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         reduce_op = ctx.reduce_op
         triton_block_size = ctx.triton_block_size
 
         if reduce_op == "sum":
-            return gather(grad_output, sparsity_layout_o, i, sparsity_layout_x, sparsity_block_size,
-                          triton_block_size=triton_block_size), None, None, None, None, None, None, None, None, None
+            return gather(grad_output, sparsity_layout_o, dim, i, sparsity_layout_x, sparsity_block_size,
+                          triton_block_size=triton_block_size), None, None, None, None, None, None, None, None, None, None
         else:
             raise ValueError(f"Reduction operation '{reduce_op}' does not support backward pass")
 
@@ -309,10 +346,11 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
     def kernel_blocksparse_scatter(x,
                                    x_b, x_b_s, x_r_s, x_c_s,
                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                                   dim,
                                    i,
                                    i_b, i_b_s, i_r_s, i_c_s,
                                    o,
-                                   o_b, o_b_s, o_r_s, o_c_s,
+                                   o_b, o_b_s,
                                    s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
                                    r_lut_o,
                                    reduce_op_ind,
@@ -332,6 +370,10 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
         spa_row_x_msk = (spa_row_x_idx < s_lut_x_r * s_lut_x_r_s)
         spa_row_x = tl.load(s_lut_x + spa_row_x_idx, mask=spa_row_x_msk)
 
+        spa_col_x_idx = (pid_blk * s_lut_x_r_s + 2 * s_lut_x_c_s)
+        spa_col_x_msk = (spa_col_x_idx < s_lut_x_r * s_lut_x_r_s)
+        spa_col_x = tl.load(s_lut_x + spa_col_x_idx, mask=spa_col_x_msk)
+
         # Load x values
         blk_x_idx = ((pid_blk * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
@@ -346,22 +388,38 @@ class _BlocksparseScatterReduce(torch.autograd.Function):
         blk_i_msk = (blk_i_idx < i_b * i_b_s)
         blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
 
-        # Get positions of sparsity blocks
-        pos_spa_blk_o = blk_i // sparsity_block_size
-        pos_spa_col_o = blk_i % sparsity_block_size
+        # Get indices of sparsity blocks and positions within the blocks
+        pos_spa_blk_x = blk_i // sparsity_block_size
+        pos_spa_int_x = blk_i % sparsity_block_size
+
+        rev_dst_bat_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_x, dtype=tl.int32)
+        rev_dst_row_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_x, dtype=tl.int32)
+        rev_dst_col_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_x, dtype=tl.int32)
+        dst_row_o = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        dst_col_o = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+                     .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
+        if dim == 0:
+            rev_dst_bat_o = blk_i
+        elif dim == 1:
+            rev_dst_row_o = pos_spa_blk_x
+            dst_row_o = pos_spa_int_x * x_r_s
+        elif dim == 2:
+            rev_dst_col_o = pos_spa_blk_x
+            dst_col_o = pos_spa_int_x * x_c_s
 
         # Load reverse sparsity indices for o
-        rev_idx_spa_o_idx = ((spa_bat_x * s_l_o_b_s) +
-                             (spa_row_x * s_l_o_r_s) +
-                             (pos_spa_blk_o * s_l_o_c_s))
+        rev_idx_spa_o_idx = ((rev_dst_bat_o * s_l_o_b_s) +
+                             (rev_dst_row_o * s_l_o_r_s) +
+                             (rev_dst_col_o * s_l_o_c_s))
         rev_idx_spa_o_msk = (rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
         rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)
 
         # Store output
         blk_o_idx = ((rev_idx_spa_o * o_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                     (pos_spa_col_o * o_c_s))
-        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+                     dst_row_o +
+                     dst_col_o)
+        blk_o_msk = ((blk_o_idx < o_b * o_b_s) & rev_idx_spa_o_msk != -1)
 
         if reduce_op_ind == 0:
             tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
blksprs/ops/experimental/distribution_mdi.py CHANGED
@@ -153,6 +153,10 @@ class _BlocksparseGatherMDI(torch.autograd.Function):
         rev_idx_spa_x_msk = (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
         rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
 
+        if rev_idx_spa_x == -1:
+            tl.device_assert(False)
+            return
+
         # Load x values
         blk_x_idx = ((rev_idx_spa_x * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
@@ -342,6 +346,10 @@ class _BlocksparseScatterReduceMDI(torch.autograd.Function):
         rev_idx_spa_o_msk = (rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
         rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)
 
+        if rev_idx_spa_o == -1:
+            tl.device_assert(False)
+            return
+
         # Store output
         blk_o_idx = ((rev_idx_spa_o * o_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
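With this guard, the experimental MDI kernels fail fast instead of silently misaddressing memory: when the reverse LUT yields -1 (the indexed block is absent from the target layout), the program trips a device-side assertion and returns. A host-side sketch of the equivalent check (hypothetical helper, my own illustration):

```python
import torch

def check_indices(rev_lut: torch.Tensor, blk_idx: torch.Tensor) -> None:
    # Raise if any looked-up block is absent from the sparsity layout (-1 entry).
    if (rev_lut.flatten()[blk_idx.flatten()] == -1).any():
        raise IndexError("index refers to a sparsity block absent from the layout")
```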
blksprs/ops/misc/row_wise.py CHANGED
@@ -117,6 +117,10 @@ def kernel_blocksparse_row_wise_sum(x,
     rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
+    if rev_idx_spa == -1:
+        tl.device_assert(False)
+        return
+
     blk_idx = ((pid_blk * x_b_s) +
                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
@@ -240,6 +244,10 @@ def kernel_blocksparse_row_wise_max(x,
     rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
+    if rev_idx_spa == -1:
+        tl.device_assert(False)
+        return
+
     blk_idx = ((pid_blk * x_b_s) +
                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
blksprs/ops/softmax.py CHANGED
@@ -238,6 +238,10 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
 
+        if rev_idx_spa_s == -1:
+            tl.device_assert(False)
+            return
+
         blk_s_idx = (rev_idx_spa_s * s_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
blksprs-1.8.3.dist-info/METADATA → blksprs-1.9.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.8.3
+Version: 1.9
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
blksprs-1.8.3.dist-info/RECORD → blksprs-1.9.dist-info/RECORD CHANGED
@@ -1,23 +1,23 @@
 blksprs/__init__.py,sha256=YMrERuEf1hTv5vVdOvPEzh9rESn4uqOB7WHB12Qs5lU,1836
-blksprs/layouting/distribution_layout.py,sha256=wmj1SwWyY_fhbvMmh6AXrR77LoSp6xLwUWCCyO9i5lk,4239
+blksprs/layouting/distribution_layout.py,sha256=9f_Bx2YQF4LTH95C0S7OuB9eeOuh73NcE0Z7Wrtug38,5034
 blksprs/layouting/sparsity_layout.py,sha256=-sScIn4hhG35j9BXytrojEzp8jnFkMargJjtivPV1fc,9755
 blksprs/ops/conversion.py,sha256=ol-iV45wDzp9G1dJEkY53EdrvnmHzcl7QQmPJ-xqQTs,22410
-blksprs/ops/distribution.py,sha256=fXZV6UegCVpIwzh-A825OSYClHWu5k0UMYdO2UGDUpM,17067
+blksprs/ops/distribution.py,sha256=OWTH_dfO43uIMY6S44wpvRoIBuKzaTy1f57BOEf7EYA,19925
 blksprs/ops/matmul.py,sha256=yh2ZnO0ZltT1AgadiFP0vX28YJ4n74xO-I_5vFUmOmA,11452
 blksprs/ops/partitioning.py,sha256=K0ExR2a3W62d_9xxCJzsdJDLgtbxTI6P8loOOBdhPzE,7674
 blksprs/ops/repeat.py,sha256=IvSIRbuyFn0b57LObymLgup0LqlWQ3ndIw-QuiYQcaU,14564
-blksprs/ops/softmax.py,sha256=CDQT2KnwkJ4hGIgT0EUp6P92uiYpCdJQ9zxcdgSAAJA,12102
+blksprs/ops/softmax.py,sha256=V-1vqRefjjwSp6JPwKxVxh5pTng9gOdtgGlXHDPbpYM,12190
 blksprs/ops/transpose.py,sha256=jxzFFffrj4S_9tiCrwwUMdz6EA98o1dziWXjlqb64a4,6859
-blksprs/ops/experimental/distribution_mdi.py,sha256=HaRUu6LTWATzjuHWgddIUE-0fgY-O87STpJO4JY7k_8,20357
+blksprs/ops/experimental/distribution_mdi.py,sha256=F_0tl4Gn-9JZs_TZfDtZqO_RPFl7sejqQNF8UNIoCbs,20533
 blksprs/ops/misc/broadcast_ops.py,sha256=cPtRJa3pkZfY1QG51CJ-zDn4SK-CRpX5LEXoKGGMvRU,5418
 blksprs/ops/misc/exp.py,sha256=FnSFosBfJHuiEbD0MD-i4axLghRn4a0f8KvHXrKBB6M,3802
-blksprs/ops/misc/row_wise.py,sha256=SvJuNww-_QoVKTyTjMvjmzHlBuUlTKamkuq_rKzwAqs,17081
+blksprs/ops/misc/row_wise.py,sha256=U4Kk0-P4oOuMNjMHXxP2gP9njMIeMfz8RZrzItNIF94,17229
 blksprs/utils/benchmarking.py,sha256=4pLVlnPW_2EM-NT3n4SClaRznVYEljztLbJcccz8kZE,1360
 blksprs/utils/blksprs_tensor.py,sha256=VjplBgDhnf9sxf-1R5feA0xp5FDCDdaeZmCeoIRdCnc,151
 blksprs/utils/processing.py,sha256=hYsFxEbQKcbqU4WtZWusPnWMHg8ZAZF1SKZJYjez9aU,2060
 blksprs/utils/tools.py,sha256=r7Y4C37vfSWUyQTGwa8NyRqgovmsq9hMufkenqYHOxo,539
 blksprs/utils/validation.py,sha256=IZxH2HZpePmv7lRqLsSwV_6FwsdnTXv9q4j98vCMSsQ,4195
-blksprs-1.8.3.dist-info/METADATA,sha256=DZkJ_HeetF1V6-_F6GeG0uXT-QmttMFOq4ao8fiSMgQ,8458
-blksprs-1.8.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-blksprs-1.8.3.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
-blksprs-1.8.3.dist-info/RECORD,,
+blksprs-1.9.dist-info/METADATA,sha256=9mMjmvJ2_Rz0uyiY9S8SKTRcs6YW5Jk1w6PRobh6Q3c,8456
+blksprs-1.9.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+blksprs-1.9.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-1.9.dist-info/RECORD,,