PyPI - blksprs - Versions diffs - 1.9.3__py3-none-any.whl → 1.10.1__py3-none-any.whl - Mend

blksprs 1.9.3__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

blksprs/__init__.py +0 -6
blksprs/layouting/distribution_layout.py +6 -6
blksprs/layouting/sparsity_layout.py +7 -7
blksprs/ops/conversion.py +19 -21
blksprs/ops/distribution.py +14 -14
blksprs/ops/flow.py +12 -12
blksprs/ops/matmul.py +8 -8
blksprs/ops/misc/broadcast_ops.py +6 -6
blksprs/ops/misc/exp.py +2 -2
blksprs/ops/misc/row_wise.py +16 -19
blksprs/ops/partitioning.py +24 -10
blksprs/ops/softmax.py +17 -16
blksprs/ops/transpose.py +9 -8
{blksprs-1.9.3.dist-info → blksprs-1.10.1.dist-info}/METADATA +18 -14
blksprs-1.10.1.dist-info/RECORD +24 -0
blksprs/ops/experimental/distribution_mdi.py +0 -447
blksprs-1.9.3.dist-info/RECORD +0 -25
{blksprs-1.9.3.dist-info → blksprs-1.10.1.dist-info}/WHEEL +0 -0
{blksprs-1.9.3.dist-info → blksprs-1.10.1.dist-info}/top_level.txt +0 -0

blksprs/ops/misc/row_wise.py CHANGED Viewed

@@ -104,17 +104,17 @@ def kernel_blocksparse_row_wise_sum(x,
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
-    spa_bat_msk = (spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
+    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
     spa_bat = tl.load(s_lut_x + spa_bat_idx, mask=spa_bat_msk)
     spa_row_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
-    spa_row_msk = (spa_row_idx < s_lut_x_r * s_lut_x_r_s)
+    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_x_r * s_lut_x_r_s)
     spa_row = tl.load(s_lut_x + spa_row_idx, mask=spa_row_msk)
     # Load reverse sparsity index for current block
     rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
                        spa_row * s_l_o_r_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
+    rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
     if rev_idx_spa == -1:
@@ -124,7 +124,7 @@ def kernel_blocksparse_row_wise_sum(x,
     blk_idx = ((pid_blk * x_b_s) +
                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_msk = (blk_idx < x_b * x_b_s)
+    blk_msk = (blk_idx >= 0 and blk_idx < x_b * x_b_s)
     blk = tl.load(x + blk_idx, mask=blk_msk)
     buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
@@ -132,7 +132,7 @@ def kernel_blocksparse_row_wise_sum(x,
     o_idx = (rev_idx_spa * o_b_s +
              ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
              (tl.arange(0, 1))[None, :])
-    o_msk = (o_idx < o_b * o_b_s)
+    o_msk = (o_idx >= 0 and o_idx < o_b * o_b_s)
     tl.atomic_add(o + o_idx, buf, o_msk)
@@ -231,17 +231,17 @@ def kernel_blocksparse_row_wise_max(x,
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
-    spa_bat_msk = (spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
+    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
     spa_bat = tl.load(s_lut_x + spa_bat_idx, mask=spa_bat_msk)
     spa_row_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
-    spa_row_msk = (spa_row_idx < s_lut_x_r * s_lut_x_r_s)
+    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_x_r * s_lut_x_r_s)
     spa_row = tl.load(s_lut_x + spa_row_idx, mask=spa_row_msk)
     # Load reverse sparsity index for current block
     rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
                        spa_row * s_l_o_r_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
+    rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
     if rev_idx_spa == -1:
@@ -251,7 +251,7 @@ def kernel_blocksparse_row_wise_max(x,
     blk_idx = ((pid_blk * x_b_s) +
                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_msk = (blk_idx < x_b * x_b_s)
+    blk_msk = (blk_idx >= 0 and blk_idx < x_b * x_b_s)
     blk = tl.load(x + blk_idx, mask=blk_msk)
     buf = tl.reshape(tl.max(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
@@ -259,7 +259,7 @@ def kernel_blocksparse_row_wise_max(x,
     o_idx = (rev_idx_spa * o_b_s +
              ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
              (tl.arange(0, 1))[None, :])
-    o_msk = (o_idx < o_b * o_b_s)
+    o_msk = (o_idx >= 0 and o_idx < o_b * o_b_s)
     tl.atomic_max(o + o_idx, buf, o_msk)
@@ -356,17 +356,17 @@ def kernel_blocksparse_row_wise_add(x,
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
     spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
     spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
     spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
     # Get reverse sparsity indices for s
     rev_idx_spa_s_idx = (spa_bat * s_l_y_b_s +
                          spa_row * s_l_y_r_s)
-    rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_y_b * s_l_y_b_s)
+    rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0 and rev_idx_spa_s_idx < s_l_y_b * s_l_y_b_s)
     rev_idx_spa_s = tl.load(r_lut_y + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
     if rev_idx_spa_s == -1:
@@ -377,25 +377,22 @@ def kernel_blocksparse_row_wise_add(x,
     blk_x_idx = ((pid_blk * x_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx < x_b * x_b_s)
+    blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
     # Load sum block
     blk_s_idx = (rev_idx_spa_s * y_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * y_r_s)[:, None] +
                  (tl.arange(0, 1) * y_c_s)[None, :])
-    blk_s_msk = (blk_s_idx < y_b * y_b_s)
+    blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < y_b * y_b_s)
     blk_s = tl.load(y + blk_s_idx, mask=blk_s_msk)
     # Compute exp
     buf = blk_x + tl.broadcast_to(blk_s, (TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE))
-    # debug
-    asdf = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), 1.0, dtype=tl.float32)
     # Store block
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx < o_b * o_b_s)
+    blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)

blksprs/ops/partitioning.py CHANGED Viewed

@@ -9,13 +9,14 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
 def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
-          sparsity_block_size: int, triton_block_size: int = None) -> (BlksprsTensor, Tensor):
+          dim: int, sparsity_block_size: int, triton_block_size: int = None) -> (BlksprsTensor, Tensor):
     """Splits a block-sparse tensor in compressed form along the last dimension into partitions.
     Args:
         x (BlksprsTensor): A block-sparse tensor in compressed form.
         sparsity_layout (Tensor): The sparsity layout of the block-sparse tensor.
         partitions (int): The number of partitions to split the block-sparse tensor into.
+        dim (int): The dimension along which to split the tensor. Currently only supports dim=2.
         sparsity_block_size (int): The size of the sparsity blocks.
         triton_block_size (int): The block size to use for the triton kernel (default ``None``).
@@ -54,17 +55,22 @@ def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
     validate_contiguous(sparsity_layout_output, sparsity_lut, sparsity_reverse_lut)
+    adjusted_dim = dim % 3
+    if adjusted_dim != 2:
+        raise NotImplementedError("Currently only supports dim=2")
     return BlksprsTensor(_BlocksparseSplit.apply(x, sparsity_layout_output, sparsity_lut, sparsity_reverse_lut, partitions,
-                                   sparsity_block_size, n_sparse_blocks, triton_block_size)), sparsity_layout_output
+                                   adjusted_dim, sparsity_block_size, n_sparse_blocks, triton_block_size)), sparsity_layout_output
 class _BlocksparseSplit(torch.autograd.Function):
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
-                num_partitions: int, sparsity_block_size: int, n_sparse_blocks: int, triton_block_size: int) -> Tensor:
+                num_partitions: int, dim: int, sparsity_block_size: int, n_sparse_blocks: int, triton_block_size: int) -> Tensor:
         ctx.save_for_backward(sparsity_layout_o)
         ctx.num_partitions = num_partitions
+        ctx.dim = dim
         return flow_forward(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
                             n_sparse_blocks, triton_block_size)
@@ -73,21 +79,23 @@ class _BlocksparseSplit(torch.autograd.Function):
     def backward(ctx, grad_output):
         sparsity_layout = ctx.saved_tensors[0]
         num_partitions = ctx.num_partitions
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         triton_block_size = ctx.triton_block_size
-        return merge(grad_output, sparsity_layout, num_partitions,
-                     sparsity_block_size, triton_block_size)[0], None, None, None, None, None, None, None
+        return merge(grad_output, sparsity_layout, num_partitions, dim,
+                     sparsity_block_size, triton_block_size)[0], None, None, None, None, None, None, None, None
 def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
-          sparsity_block_size: int, triton_block_size: int = None) -> (BlksprsTensor, Tensor):
+          dim: int, sparsity_block_size: int, triton_block_size: int = None) -> (BlksprsTensor, Tensor):
     """Merges the specified partitions of a block-sparse tensor in compressed form along the last dimension.
     Args:
         x (BlksprsTensor): A block-sparse tensor in compressed form.
         sparsity_layout (Tensor): The sparsity layout of the block-sparse tensor.
         partitions (int): The number of partitions to be merged.
+        dim (int): The dimension along which to merge the tensor. Currently only supports dim=2.
         sparsity_block_size (int): The size of the sparsity blocks.
         triton_block_size (int): The block size to use for the triton kernel (default ``None``).
@@ -128,17 +136,22 @@ def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
     validate_contiguous(sparsity_layout_output, sparsity_lut, sparsity_reverse_lut)
+    adjusted_dim = dim % 3
+    if adjusted_dim != 2:
+        raise NotImplementedError("Currently only supports dim=2")
     return BlksprsTensor(_BlocksparseMerge.apply(x, sparsity_layout_output, sparsity_lut, sparsity_reverse_lut, partitions,
-                                   sparsity_block_size, n_sparse_blocks, triton_block_size)), sparsity_layout_output
+                                   adjusted_dim, sparsity_block_size, n_sparse_blocks, triton_block_size)), sparsity_layout_output
 class _BlocksparseMerge(torch.autograd.Function):
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
-                num_partitions: int, sparsity_block_size: int, n_sparse_blocks: int, triton_block_size: int) -> Tensor:
+                num_partitions: int, dim: int, sparsity_block_size: int, n_sparse_blocks: int, triton_block_size: int) -> Tensor:
         ctx.save_for_backward(sparsity_layout_o)
         ctx.num_partitions = num_partitions
+        ctx.dim = dim
         return flow_forward(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
                             n_sparse_blocks, triton_block_size)
@@ -147,10 +160,11 @@ class _BlocksparseMerge(torch.autograd.Function):
     def backward(ctx, grad_output):
         sparsity_layout = ctx.saved_tensors[0]
         num_partitions = ctx.num_partitions
+        dim = ctx.dim
         sparsity_block_size = ctx.sparsity_block_size
         triton_block_size = ctx.triton_block_size
-        return split(grad_output, sparsity_layout, num_partitions,
-                     sparsity_block_size, triton_block_size)[0], None, None, None, None, None, None, None
+        return split(grad_output, sparsity_layout, num_partitions, dim,
+                     sparsity_block_size, triton_block_size)[0], None, None, None, None, None, None, None, None

blksprs/ops/softmax.py CHANGED Viewed

@@ -11,7 +11,8 @@ from blksprs.utils.validation import validate_contiguous, validate_dimensions, v
     validate_sparsity, validate_sparsity_block_size, validate_triton_block_size
-def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
+def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
+            triton_block_size: int = None) -> BlksprsTensor:
     """Computes the softmax of a block-sparse tensor in compressed form.
     Note:
@@ -47,9 +48,9 @@ def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
     validate_contiguous(sparsity_layout, sparsity_lut, sparsity_reverse_lut_rws)
     return BlksprsTensor(_BlocksparseSoftmax.apply(x, sparsity_layout,
-                                     sparsity_lut,
-                                     sparsity_reverse_lut_rws,
-                                     sparsity_block_size, triton_block_size))
+                                                   sparsity_lut,
+                                                   sparsity_reverse_lut_rws,
+                                                   sparsity_block_size, triton_block_size))
 class _BlocksparseSoftmax(torch.autograd.Function):
@@ -168,17 +169,17 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         # Get position of current sparsity block consisting of its batch and row index
         spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-        spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+        spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
         spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
         spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-        spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+        spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
         spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
         # Get reverse sparsity indices for s
         rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
                              spa_row * s_l_s_r_s)
-        rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
+        rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0 and rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
         if rev_idx_spa_s == -1:
@@ -189,14 +190,14 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         blk_x_idx = ((pid_blk * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx < x_b * x_b_s)
+        blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
         # Load sum block
         blk_s_idx = (rev_idx_spa_s * s_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (blk_s_idx < s_b * s_b_s)
+        blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < s_b * s_b_s)
         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
         # Compute softmax
@@ -226,16 +227,16 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         # Get position of current sparsity block consisting of its batch and row index
         spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-        spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+        spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
         spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
         spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-        spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+        spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
         spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
         rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
                              spa_row * s_l_s_r_s)
-        rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
+        rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0 and rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
         if rev_idx_spa_s == -1:
@@ -245,19 +246,19 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         blk_s_idx = (rev_idx_spa_s * s_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (blk_s_idx < s_b * s_b_s)
+        blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < s_b * s_b_s)
         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
         blk_g_idx = ((pid_blk * g_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_c_s)[None, :])
-        blk_g_msk = (blk_g_idx < g_b * g_b_s)
+        blk_g_msk = (blk_g_idx >= 0 and blk_g_idx < g_b * g_b_s)
         blk_g = tl.load(g + blk_g_idx, mask=blk_g_msk)
         blk_x_idx = ((pid_blk * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx < x_b * x_b_s)
+        blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
         buf = blk_x * (blk_g - blk_s)
@@ -265,5 +266,5 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         blk_o_idx = ((pid_blk * o_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+        blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, buf, mask=blk_o_msk)

blksprs/ops/transpose.py CHANGED Viewed

@@ -50,8 +50,9 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: in
     validate_contiguous(sparsity_layout_t, sparsity_lut, sparsity_reverse_lut)
-    return BlksprsTensor(_BlocksparseTranspose.apply(x, sparsity_layout_t, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
-                                       n_sparse_blocks, triton_block_size)), sparsity_layout_t
+    return BlksprsTensor(
+        _BlocksparseTranspose.apply(x, sparsity_layout_t, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
+                                    n_sparse_blocks, triton_block_size)), sparsity_layout_t
 class _BlocksparseTranspose(torch.autograd.Function):
@@ -122,22 +123,22 @@ class _BlocksparseTranspose(torch.autograd.Function):
         # Get sparsity index of current output block consisting of its batch, row, and column index
         spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-        spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+        spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
         spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
         spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-        spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+        spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
         spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
         spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-        spa_col_msk = (spa_col_idx < s_lut_r * s_lut_r_s)
+        spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
         spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
         # Get reverse sparsity index
         rev_idx_spa_idx = (spa_bat * s_l_b_s +
                            spa_row * s_l_r_s +
                            spa_col * s_l_c_s)
-        rev_idx_spa_msk = (rev_idx_spa_idx < s_l_b * s_l_b_s)
+        rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_b * s_l_b_s)
         rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
         if rev_idx_spa == -1:
@@ -147,7 +148,7 @@ class _BlocksparseTranspose(torch.autograd.Function):
         blk_x_idx = (rev_idx_spa * x_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx < x_b * x_b_s)
+        blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
         blk_x_t = tl.trans(blk_x)
@@ -155,5 +156,5 @@ class _BlocksparseTranspose(torch.autograd.Function):
         blk_o_idx = (pid_blk * o_b_s +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_o_msk = (blk_o_idx < o_b * o_b_s)
+        blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, blk_x_t, mask=blk_o_msk)

{blksprs-1.9.3.dist-info → blksprs-1.10.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: blksprs
-Version: 1.9.3
+Version: 1.10.1
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
@@ -23,14 +23,6 @@ Requires-Dist: build; extra == "build"
 [![GitHub Release](https://img.shields.io/github/v/release/FelixSchoen/blksprs?include_prereleases&label=Latest%20Release)](https://github.com/FelixSchoen/blksprs/releases)
 [![Python Version](https://img.shields.io/badge/Python%20Version-3.11-blue)](https://www.python.org/downloads/release/python-3119/)
-## Important Notice
-🚨 **Non-Final API** 🚨
-Although it already supports a wide variety of functions, this library is still under active development and the API is
-subject to change. For feature requests or bug reports, please open an [issue](https://github.com/FelixSchoen/blksprs/issues).
-We also encourage [pull requests](https://github.com/FelixSchoen/blksprs/pulls).
 ## Overview
 A lightweight and efficient library for operations on block-sparse matrices in PyTorch using Triton.
@@ -44,7 +36,7 @@ Currently supported operations (includes gradient calculation):
 - Scatter (_supports either no reduction or summation, gradients are only available for summation_)
 - Repeat (_supports target sparsity layout_)
 - Repeat Interleave (_supports target sparsity layout_)
-- Splitting and merging of matrices along the last dimension
+- Splitting and merging of matrices (_currently* only supports splitting and merging along the last dimension_)
 - Conversion to and from sparse form
 - Conversion to different sparsity layouts and different sparsity block sizes
@@ -70,13 +62,15 @@ Furthermore, the library provides a set of utility functions
 - for the creation of sparsity layouts based on existing
 dense tensors and for the scatter operation (module ``bs.layouting``),
 - for the application of ``nn.Linear``, ``nn.Dropout``, and ``nn.LayerNorm`` layers to block-sparse tensors,
-- as well as utility functions to apply linear layers,
-ensure correct input dimensionality, and validate input (module ``bs.utils``).
+- as well as utility functions to ensure correct input dimensionality, and validate input (module ``bs.utils``).
+_* see the [Roadmap](#roadmap) section for more information_
 ## Installation
-Note that due to the dependency on [Triton](https://github.com/triton-lang/triton) this library is only compatible with
-the Linux platform.
+Note that due to the dependency on [Triton](https://github.com/triton-lang/triton) this library is **only compatible with
+the Linux platform**.
+Keep track of this [issue](https://github.com/triton-lang/triton/issues/1640) for updates.
 We recommend installing blksprs from [PyPI](https://pypi.org/project/blksprs/) using pip:
@@ -92,6 +86,16 @@ We recommend installing blksprs from [PyPI](https://pypi.org/project/blksprs/) u
 See [`CHANGELOG.md`](https://github.com/FelixSchoen/blksprs/blob/main/CHANGELOG.md) for a detailed changelog.
+## Roadmap
+Note that since this library covers all our current needs it is in a **bugfix-only** state.
+This means that there are no plans to add new features, e.g., support for dimension specification of the ``split`` and ``merge`` operations.
+We will continue to maintain the library and fix any issues that arise.
+Should you find any bugs please open an [issue](https://github.com/FelixSchoen/blksprs/issues).
+We also encourage [pull requests](https://github.com/FelixSchoen/blksprs/pulls).
+It might be that this changes with future projects, but as of December 2024, we are content with the current state of the library.
 ## Usage
 We provide an example below to demonstrate the usage of the library.

blksprs-1.10.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,24 @@
+blksprs/__init__.py,sha256=wnpk-20jXq7xV0xa-WpHfPQuauI2gEZz9sH-0blKxP0,1766
+blksprs/layouting/distribution_layout.py,sha256=xDGY5-J7uSD8oenlf8bEJ2amMiQG3NBf2klTTydbTJE,5140
+blksprs/layouting/sparsity_layout.py,sha256=IVtHc_nN3ZM2y4GFcys70PqDWmWc7tkHlVGlToErANk,9894
+blksprs/ops/conversion.py,sha256=NK5uXMepPJ9yYh0vnxKwx5_Ffj_bAvhqPVogf_7PY0g,22248
+blksprs/ops/distribution.py,sha256=qK5t5XgQSJxXPced8RohprqCtUMMTaEP2pFm3KU1c8o,20267
+blksprs/ops/flow.py,sha256=SWHDQ5zx0cjnPR0CcAcRNZdSusSAHSU840SwDNUr24g,6437
+blksprs/ops/matmul.py,sha256=LAQyPNwWVmBMRnAex3msLSPD_aG5SblLCMiutJWqvus,11632
+blksprs/ops/partitioning.py,sha256=ugKnpvH36ND7qeJQp56M74qqfACkzcTVuXebzw__28Y,8286
+blksprs/ops/repeat.py,sha256=RCa-dITomA5v12K5Oxa5_ReA361zS7WHPNNHxSp9PGw,8578
+blksprs/ops/softmax.py,sha256=i8NJhvPRYya94AzpN6qiki6_G9KfDrtPifhWd7wbYzk,12496
+blksprs/ops/transpose.py,sha256=oAtUu7QzQnNAH3lvRs_MIvIKpBu9h74f9Sk07AxKnDM,6991
+blksprs/ops/misc/broadcast_ops.py,sha256=pv0nssSDOdDbQFttpqUIs2ZXShqfm2RYCfJH-C5x3H0,5544
+blksprs/ops/misc/exp.py,sha256=ygfw7oD6ALdPwNQX_HelKgO8I3-LCgIXH_x0gWzkUN8,3840
+blksprs/ops/misc/row_wise.py,sha256=DnV5-xEJUbqZlK2fETwHiPQDUMwT-lkc0VUhBlnJ5Y0,17458
+blksprs/utils/benchmarking.py,sha256=4pLVlnPW_2EM-NT3n4SClaRznVYEljztLbJcccz8kZE,1360
+blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
+blksprs/utils/layout_utils.py,sha256=49ZdPS_gMn_IrWty3FARbi2rda5a8g5DmAEL8LOrC30,670
+blksprs/utils/processing.py,sha256=WLuMJQ8v-YovXwcDjhlDn3N31WMZXrtyeeyKSgq_zn4,3642
+blksprs/utils/tools.py,sha256=r7Y4C37vfSWUyQTGwa8NyRqgovmsq9hMufkenqYHOxo,539
+blksprs/utils/validation.py,sha256=CbxBbeQWJo8wox5eMoVzaTlP9FVBwt3-gxUOmi3EUgw,4213
+blksprs-1.10.1.dist-info/METADATA,sha256=5in6lYCZo1bd8urYR0wkTxIiTTAIAANukLpKeZfGasY,9107
+blksprs-1.10.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+blksprs-1.10.1.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-1.10.1.dist-info/RECORD,,

blksprs 1.9.3__py3-none-any.whl → 1.10.1__py3-none-any.whl

blksprs 1.9.3__py3-none-any.whl → 1.10.1__py3-none-any.whl