blksprs 0.2b4__py3-none-any.whl → 1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
blksprs/ops/row_wise_sum.py ADDED
@@ -0,0 +1,231 @@
+ import torch
+ import triton
+ from torch import Tensor
+ from triton import language as tl
+
+ from blksprs.utils.tools import get_triton_block_size
+ from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
+     validate_sparsity, validate_sparsity_block_size, validate_triton_block_size
+
+
+ def row_wise_sum(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int,
+                  flag_slice_only: bool = False, triton_block_size: int = None) -> tuple[Tensor, Tensor]:
+     """Computes the row-wise sum of a block-sparse tensor.
+
+     Returns a block-sparse tensor in compressed form with only one block per row, where the first entry of each row
+     contains the sum of the corresponding row.
+
+     Note:
+         If ``flag_slice_only`` is set, the output will be of shape ``[x.size(0), x.size(1), 1]``.
+
+     Args:
+         x (Tensor): A block-sparse tensor in compressed form.
+         sparsity_layout (Tensor): The sparsity layout of the block-sparse tensor.
+         sparsity_block_size (int): The size of the sparsity blocks.
+         flag_slice_only (bool, optional): If set, the output will be of shape ``[x.size(0), x.size(1), 1]``
+             (default ``False``).
+         triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
+
+     Returns:
+         tuple[Tensor, Tensor]: A tuple containing a block-sparse tensor in compressed form containing the row-wise sum
+             of the input and the sparsity layout of the output tensor.
+
+     """
+     validate_dimensions(x)
+     validate_contiguous(x)
+     validate_device(x)
+     validate_sparsity(sparsity_block_size, (x, sparsity_layout))
+     validate_sparsity_block_size(sparsity_block_size, x)
+     validate_triton_block_size(triton_block_size, sparsity_block_size)
+
+     sparsity_lut = torch.nonzero(sparsity_layout).contiguous()
+     sparsity_layout_flat = sparsity_layout.reshape(-1)
+     sparsity_reverse_lut = ((torch.cumsum(sparsity_layout_flat, dim=-1) - 1) *
+                             (sparsity_layout_flat == 1) -
+                             (1 * (sparsity_layout_flat == 0)))
+
+     sparsity_layout_output, _ = torch.max(sparsity_layout, dim=-1, keepdim=True)
+     sparsity_lut_output = torch.nonzero(sparsity_layout_output).contiguous()
+     sparsity_layout_output_flat = sparsity_layout_output.reshape(-1)
+     sparsity_reverse_lut_output = ((torch.cumsum(sparsity_layout_output_flat, dim=-1) - 1) *
+                                    (sparsity_layout_output_flat == 1) -
+                                    (1 * (sparsity_layout_output_flat == 0)))
+
+     n_sparse_blocks_output = torch.sum(sparsity_layout_output.to(torch.int)).item()
+
+     validate_contiguous(sparsity_layout, sparsity_lut, sparsity_reverse_lut,
+                         sparsity_layout_output, sparsity_lut_output, sparsity_reverse_lut_output)
+
+     return (_BlocksparseRowWiseSum.apply(x,
+                                          sparsity_layout, sparsity_lut, sparsity_reverse_lut,
+                                          sparsity_layout_output, sparsity_lut_output, sparsity_reverse_lut_output,
+                                          n_sparse_blocks_output,
+                                          flag_slice_only,
+                                          sparsity_block_size, triton_block_size),
+             sparsity_layout_output)
+
+
+ class _BlocksparseRowWiseSum(torch.autograd.Function):
+     IMPLEMENTATION = "atomic_add"
+
+     @staticmethod
+     def forward(ctx, x: Tensor,
+                 sparsity_layout: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
+                 sparsity_layout_output: Tensor, sparsity_lut_output: Tensor, sparsity_reverse_lut_output: Tensor,
+                 n_sparse_blocks_output: int,
+                 flag_slice_only: bool,
+                 sparsity_block_size: int, triton_block_size: int) -> Tensor:
+         output = torch.zeros(size=(n_sparse_blocks_output,
+                                    sparsity_block_size,
+                                    1 if flag_slice_only else sparsity_block_size),
+                              device=x.device)
+
+         x_b, x_r, x_c = x.size()
+         x_b_s, x_r_s, x_c_s = x.stride()
+         s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout.size()
+         s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = sparsity_layout.stride()
+         s_lut_x_r, s_lut_x_c = sparsity_lut.size()
+         s_lut_x_r_s, s_lut_x_c_s = sparsity_lut.stride()
+         o_b, o_r, o_c = output.size()
+         o_b_s, o_r_s, o_c_s = output.stride()
+         s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
+         s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = sparsity_layout_output.stride()
+         s_lut_o_r, s_lut_o_c = sparsity_lut_output.size()
+         s_lut_o_r_s, s_lut_o_c_s = sparsity_lut_output.stride()
+
+         if triton_block_size is None:
+             triton_block_size = get_triton_block_size(sparsity_block_size)
+
+         if _BlocksparseRowWiseSum.IMPLEMENTATION == "basic":
+             triton_grid = lambda meta: [o_b,
+                                         triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"])]
+
+             (_BlocksparseRowWiseSum.kernel_blocksparse_row_wise_sum[triton_grid]
+              (x,
+               x_b, x_b_s, x_r_s, x_c_s,
+               s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c, s_l_x_c_s,
+               sparsity_reverse_lut,
+               output,
+               o_b, o_b_s, o_r_s,
+               sparsity_lut_output, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+               sparsity_block_size,
+               triton_block_size))
+         elif _BlocksparseRowWiseSum.IMPLEMENTATION == "atomic_add":
+             triton_grid = lambda meta: [x_b,
+                                         triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                         triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+
+             (_BlocksparseRowWiseSum.kernel_blocksparse_row_wise_sum_atomic_add[triton_grid]
+              (x,
+               x_b, x_b_s, x_r_s, x_c_s,
+               sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+               output,
+               o_b, o_b_s, o_r_s,
+               s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+               sparsity_reverse_lut_output,
+               triton_block_size))
+
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         raise NotImplementedError
+
+     @staticmethod
+     @triton.jit
+     def kernel_blocksparse_row_wise_sum(x,
+                                         x_b, x_b_s, x_r_s, x_c_s,
+                                         s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c, s_l_x_c_s,
+                                         r_lut_x,
+                                         o,
+                                         o_b, o_b_s, o_r_s,
+                                         s_lut_o, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+                                         sparsity_block_size,
+                                         TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+         pid_blk = tl.program_id(axis=0)
+         pid_row = tl.program_id(axis=1)
+
+         # Get position of current sparsity block consisting of its batch and row index
+         spa_bat_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
+         spa_bat_msk = (spa_bat_idx < s_lut_o_r * s_lut_o_r_s)
+         spa_bat = tl.load(s_lut_o + spa_bat_idx, mask=spa_bat_msk)
+
+         spa_row_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
+         spa_row_msk = (spa_row_idx < s_lut_o_r * s_lut_o_r_s)
+         spa_row = tl.load(s_lut_o + spa_row_idx, mask=spa_row_msk)
+
+         buf = tl.zeros(shape=(TRITON_BLOCK_SIZE, 1), dtype=tl.float32)
+
+         # Slide over triton block sized segments of input tensor
+         for i_seg_tri in range(0, tl.cdiv(s_l_x_c * sparsity_block_size, TRITON_BLOCK_SIZE)):
+             # Convert to segment index of sparsity layout
+             i_seg_spa = (i_seg_tri * TRITON_BLOCK_SIZE) // sparsity_block_size
+             # Calculate the triton segment index within a block
+             i_seg_tri_mod = i_seg_tri % (sparsity_block_size // TRITON_BLOCK_SIZE)
+
+             # Load reverse sparsity index for current block
+             rev_idx_spa_idx = (spa_bat * s_l_x_b_s +
+                                spa_row * s_l_x_r_s +
+                                i_seg_spa * s_l_x_c_s)
+             rev_idx_spa_msk = (rev_idx_spa_idx < s_l_x_b * s_l_x_b_s)
+             rev_idx_spa = tl.load(r_lut_x + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
+
+             # If block is present commence operations
+             if rev_idx_spa >= 0:
+                 blk_idx = ((rev_idx_spa * x_b_s) +
+                            ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                            ((i_seg_tri_mod * TRITON_BLOCK_SIZE +
+                              tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+                 blk_msk = (blk_idx < x_b * x_b_s)
+                 blk = tl.load(x + blk_idx, mask=blk_msk)
+
+                 buf = buf + tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
+
+         o_idx = (pid_blk * o_b_s +
+                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                  (tl.arange(0, 1))[None, :])
+         o_msk = (o_idx < o_b * o_b_s)
+         tl.store(o + o_idx, buf, o_msk)
+
+     @staticmethod
+     @triton.jit
+     def kernel_blocksparse_row_wise_sum_atomic_add(x,
+                                                    x_b, x_b_s, x_r_s, x_c_s,
+                                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                                                    o,
+                                                    o_b, o_b_s, o_r_s,
+                                                    s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+                                                    r_lut_o,
+                                                    TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+         pid_blk = tl.program_id(axis=0)
+         pid_row = tl.program_id(axis=1)
+         pid_col = tl.program_id(axis=2)
+
+         # Get position of current sparsity block consisting of its batch and row index
+         spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
+         spa_bat_msk = (spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
+         spa_bat = tl.load(s_lut_x + spa_bat_idx, mask=spa_bat_msk)
+
+         spa_row_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
+         spa_row_msk = (spa_row_idx < s_lut_x_r * s_lut_x_r_s)
+         spa_row = tl.load(s_lut_x + spa_row_idx, mask=spa_row_msk)
+
+         # Load reverse sparsity index for current block
+         rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
+                            spa_row * s_l_o_r_s)
+         rev_idx_spa_msk = (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
+         rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
+
+         blk_idx = ((pid_blk * x_b_s) +
+                    ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                    ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+         blk_msk = (blk_idx < x_b * x_b_s)
+         blk = tl.load(x + blk_idx, mask=blk_msk)
+
+         buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
+
+         o_idx = (rev_idx_spa * o_b_s +
+                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                  (tl.arange(0, 1))[None, :])
+         o_msk = (o_idx < o_b * o_b_s)
+         tl.atomic_add(o + o_idx, buf, o_msk)
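
The ``(torch.cumsum(flat, dim=-1) - 1) * (flat == 1) - (1 * (flat == 0))`` construction above is the recurring reverse-LUT idiom of this release: it maps every position of the flattened sparsity layout to the index of the corresponding block in the compressed tensor, or to -1 where no block is stored. A minimal standalone demonstration in plain PyTorch (illustrative only, not part of the package):

import torch

sparsity_layout = torch.tensor([[[1, 0],
                                 [0, 1]]])
flat = sparsity_layout.reshape(-1)                # tensor([1, 0, 0, 1])
reverse_lut = ((torch.cumsum(flat, dim=-1) - 1) *
               (flat == 1) -
               (1 * (flat == 0)))
print(reverse_lut)                                # tensor([ 0, -1, -1,  1])

The kernels above read this table with a masked tl.load, cast the result to tl.int32, and treat -1 as an absent block: the "basic" kernel skips such blocks, while the "atomic_add" kernel uses the output-side table, whose entry is never -1 for a row that contains at least one input block.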
blksprs/ops/softmax.py ADDED
@@ -0,0 +1,263 @@
+ import torch
+ import triton
+ from torch import Tensor
+ from triton import language as tl
+
+ from blksprs.ops.exp import exp
+ from blksprs.ops.row_wise_sum import row_wise_sum
+ from blksprs.utils.tools import get_triton_block_size
+ from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
+     validate_sparsity, validate_sparsity_block_size, validate_triton_block_size
+
+
+ def softmax(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int, triton_block_size: int = None) -> Tensor:
+     """Computes the softmax of a block-sparse tensor in compressed form.
+
+     Note:
+         Sparse blocks are not considered in the calculation of the softmax, i.e., their values are treated as ``-inf``.
+
+     Args:
+         x (Tensor): A block-sparse tensor in compressed form.
+         sparsity_layout (Tensor): The sparsity layout of the block-sparse tensor.
+         sparsity_block_size (int): The size of the sparsity blocks.
+         triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
+
+     Returns:
+         Tensor: The result of the softmax operation as a block-sparse tensor in compressed form.
+
+     """
+     validate_dimensions(x)
+     validate_contiguous(x)
+     validate_device(x)
+     validate_sparsity(sparsity_block_size, (x, sparsity_layout))
+     validate_sparsity_block_size(sparsity_block_size, x)
+     validate_triton_block_size(triton_block_size, sparsity_block_size)
+
+     if x.size(0) != 0:
+         max_val = torch.max(x).item()
+     else:
+         max_val = 0
+     x_scaled = x - max_val
+
+     sparsity_lut = torch.nonzero(sparsity_layout).contiguous()
+
+     sparsity_layout_rws, _ = torch.max(sparsity_layout, dim=-1, keepdim=True)
+     sparsity_layout_rws_flat = sparsity_layout_rws.reshape(-1)
+     sparsity_reverse_lut_rws = ((torch.cumsum(sparsity_layout_rws_flat, dim=-1) - 1) *
+                                 (sparsity_layout_rws_flat == 1) -
+                                 (1 * (sparsity_layout_rws_flat == 0)))
+
+     validate_contiguous(sparsity_layout, sparsity_lut, sparsity_reverse_lut_rws)
+
+     return _BlocksparseSoftmax.apply(x_scaled, sparsity_layout,
+                                      sparsity_lut,
+                                      sparsity_reverse_lut_rws,
+                                      sparsity_block_size, triton_block_size)
+
+
+ class _BlocksparseSoftmax(torch.autograd.Function):
+
+     @staticmethod
+     def forward(ctx, x: Tensor, sparsity_layout: Tensor,
+                 sparsity_lut: Tensor,
+                 sparsity_reverse_lut_rws: Tensor,
+                 sparsity_block_size: int, triton_block_size: int) -> Tensor:
+         output = torch.empty_like(x)
+
+         x_b, x_r, x_c = x.shape
+         x_b_s, x_r_s, x_c_s = x.stride()
+         s_lut_r, s_lut_c = sparsity_lut.shape
+         s_lut_r_s, s_lut_c_s = sparsity_lut.stride()
+         o_b, o_r, o_c = output.shape
+
+         x_exp = exp(x, sparsity_block_size, triton_block_size=triton_block_size)
+         x_exp_row_wise_sum, sparsity_layout_rws = row_wise_sum(x_exp, sparsity_layout, sparsity_block_size,
+                                                                flag_slice_only=True,
+                                                                triton_block_size=triton_block_size)
+
+         s_b, s_r, s_c = x_exp_row_wise_sum.shape
+         s_b_s, s_r_s, s_c_s = x_exp_row_wise_sum.stride()
+         s_l_s_b, s_l_s_r, s_l_s_c = sparsity_layout_rws.shape
+         s_l_s_b_s, s_l_s_r_s, s_l_s_c_s = sparsity_layout_rws.stride()
+
+         if triton_block_size is None:
+             triton_block_size = get_triton_block_size(sparsity_block_size)
+
+         triton_grid = lambda meta: [o_b,
+                                     triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                     triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+
+         (_BlocksparseSoftmax.kernel_blocksparse_softmax[triton_grid]
+          (x_exp,
+           x_b, x_b_s, x_r_s, x_c_s,
+           sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+           x_exp_row_wise_sum, s_b, s_b_s, s_r_s, s_c_s,
+           s_l_s_b, s_l_s_b_s, s_l_s_r_s,
+           sparsity_reverse_lut_rws,
+           output,
+           triton_block_size))
+
+         # Save for backward pass
+         ctx.save_for_backward(output, sparsity_layout, sparsity_lut)
+         ctx.sparsity_block_size = sparsity_block_size
+         ctx.triton_block_size = triton_block_size
+
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         o, sparsity_layout, sparsity_lut = ctx.saved_tensors
+         sparsity_block_size = ctx.sparsity_block_size
+         triton_block_size = ctx.triton_block_size
+
+         s, sparsity_layout_s = row_wise_sum(grad_output * o, sparsity_layout, sparsity_block_size, flag_slice_only=True,
+                                             triton_block_size=triton_block_size)
+
+         sparsity_layout_s_flat = sparsity_layout_s.reshape(-1)
+         sparsity_reverse_lut_s = ((torch.cumsum(sparsity_layout_s_flat, dim=-1) - 1) *
+                                   (sparsity_layout_s_flat == 1) -
+                                   (1 * (sparsity_layout_s_flat == 0)))
+
+         o_b, o_r, o_c = o.size()
+         o_b_s, o_r_s, o_c_s = o.stride()
+         s_lut_r, s_lut_c = sparsity_lut.size()
+         s_lut_r_s, s_lut_c_s = sparsity_lut.stride()
+         s_b, s_r, s_c = s.size()
+         s_b_s, s_r_s, s_c_s = s.stride()
+         s_l_s_b, s_l_s_r, s_l_s_c = sparsity_layout_s.size()
+         s_l_s_b_s, s_l_s_r_s, s_l_s_c_s = sparsity_layout_s.stride()
+
+         grad_x = torch.empty_like(o)
+
+         triton_grid = lambda meta: [o_b,
+                                     triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                     triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+
+         (_BlocksparseSoftmax.kernel_blocksparse_softmax_grad_x[triton_grid]
+          (grad_output,
+           o_b, o_b_s, o_r_s, o_c_s,
+           o,
+           o_b, o_b_s, o_r_s, o_c_s,
+           sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+           s,
+           s_b, s_b_s, s_r_s, s_c_s,
+           s_l_s_b, s_l_s_b_s, s_l_s_r_s,
+           sparsity_reverse_lut_s,
+           grad_x,
+           o_b, o_b_s, o_r_s, o_c_s,
+           triton_block_size
+           ))
+
+         return grad_x, None, None, None, None, None
+
+     @staticmethod
+     @triton.jit
+     def kernel_blocksparse_softmax(x,
+                                    x_b, x_b_s, x_r_s, x_c_s,
+                                    s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                    s, s_b, s_b_s, s_r_s, s_c_s,
+                                    s_l_s_b, s_l_s_b_s, s_l_s_r_s,
+                                    r_lut_s,
+                                    o,
+                                    TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+         # Get triton block indices
+         pid_blk = tl.program_id(axis=0)
+         pid_row = tl.program_id(axis=1)
+         pid_col = tl.program_id(axis=2)
+
+         # Get position of current sparsity block consisting of its batch and row index
+         spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
+         spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+         spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+
+         spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
+         spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+         spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+
+         # Get reverse sparsity indices for x
+         rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
+                              spa_row * s_l_s_r_s)
+         rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
+         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
+
+         if rev_idx_spa_s == -1:
+             assert False, "Invalid sparsity block"
+
+         # Load x block
+         blk_x_idx = ((pid_blk * x_b_s) +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+         blk_x_msk = (blk_x_idx < x_b * x_b_s)
+         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+
+         # Load sum block
+         blk_s_idx = (rev_idx_spa_s * s_b_s +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
+                      (tl.arange(0, 1) * s_c_s)[None, :])
+         blk_s_msk = (blk_s_idx < s_b * s_b_s)
+         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
+
+         # Compute softmax
+         buf = tl.div_rn(blk_x, blk_s)
+
+         # Store output
+         tl.store(o + blk_x_idx, buf, mask=blk_x_msk)
+
+     @staticmethod
+     @triton.jit
+     def kernel_blocksparse_softmax_grad_x(g,
+                                           g_b, g_b_s, g_r_s, g_c_s,
+                                           x,
+                                           x_b, x_b_s, x_r_s, x_c_s,
+                                           s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                           s,
+                                           s_b, s_b_s, s_r_s, s_c_s,
+                                           s_l_s_b, s_l_s_b_s, s_l_s_r_s,
+                                           r_lut_s,
+                                           o,
+                                           o_b, o_b_s, o_r_s, o_c_s,
+                                           TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+         # Get triton block indices
+         pid_blk = tl.program_id(axis=0)
+         pid_row = tl.program_id(axis=1)
+         pid_col = tl.program_id(axis=2)
+
+         # Get position of current sparsity block consisting of its batch and row index
+         spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
+         spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+         spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+
+         spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
+         spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+         spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+
+         rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
+                              spa_row * s_l_s_r_s)
+         rev_idx_spa_s_msk = (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
+         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
+
+         blk_s_idx = (rev_idx_spa_s * s_b_s +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
+                      (tl.arange(0, 1) * s_c_s)[None, :])
+         blk_s_msk = (blk_s_idx < s_b * s_b_s)
+         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
+
+         blk_g_idx = ((pid_blk * g_b_s) +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_r_s)[:, None] +
+                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_c_s)[None, :])
+         blk_g_msk = (blk_g_idx < g_b * g_b_s)
+         blk_g = tl.load(g + blk_g_idx, mask=blk_g_msk)
+
+         blk_x_idx = ((pid_blk * x_b_s) +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+         blk_x_msk = (blk_x_idx < x_b * x_b_s)
+         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+
+         buf = blk_x * (blk_g - blk_s)
+
+         blk_o_idx = ((pid_blk * o_b_s) +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+         blk_o_msk = (blk_o_idx < o_b * o_b_s)
+         tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
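
For reference, ``kernel_blocksparse_softmax_grad_x`` implements the standard softmax vector-Jacobian product: with ``y = softmax(x)`` and upstream gradient ``g``, ``grad_x = y * (g - rowsum(g * y))``, where the row-wise sum is exactly what ``row_wise_sum(grad_output * o, ...)`` produces above (the forward pass additionally subtracts the global maximum before exponentiation for numerical stability). A dense sanity check of that identity in plain PyTorch (illustrative only, independent of blksprs):

import torch

x = torch.randn(4, 8, dtype=torch.float64, requires_grad=True)
y = torch.softmax(x, dim=-1)
g = torch.randn_like(y)

# Identity used by the kernel: grad_x = y * (g - rowsum(g * y))
grad_manual = y * (g - (g * y).sum(dim=-1, keepdim=True))
grad_autograd, = torch.autograd.grad(y, x, grad_outputs=g)
assert torch.allclose(grad_manual, grad_autograd)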
@@ -0,0 +1,154 @@
+ import torch
+ import triton
+ from torch import Tensor
+ from triton import language as tl
+
+ from blksprs.utils.tools import get_triton_block_size
+ from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
+     validate_sparsity, validate_sparsity_block_size, validate_triton_block_size
+
+
+ def transpose(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int, triton_block_size: int = None) -> (
+         Tensor, Tensor):
+     """Transposes a block-sparse tensor in compressed form.
+
+     Note:
+         Returns both the transposed tensor and the sparsity layout of the transposed tensor.
+
+     Args:
+         x (Tensor): A block-sparse tensor in compressed form.
+         sparsity_layout (Tensor): The sparsity layout of the block-sparse tensor.
+         sparsity_block_size (int): The size of the sparsity blocks.
+         triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).
+
+     Returns:
+         Tensor: The transposed block-sparse tensor in compressed form.
+         Tensor: The sparsity layout of the transposed tensor.
+
+     """
+ """
29
+ validate_dimensions(x)
30
+ validate_contiguous(x)
31
+ validate_device(x)
32
+ validate_sparsity(sparsity_block_size, (x, sparsity_layout))
33
+ validate_sparsity_block_size(sparsity_block_size, x)
34
+ validate_triton_block_size(triton_block_size, sparsity_block_size)
35
+
36
+ sparsity_layout_t = sparsity_layout.transpose(-1, -2).contiguous()
37
+
38
+ sparsity_lut = torch.nonzero(sparsity_layout_t).contiguous()
39
+
40
+ sparsity_layout_flat = sparsity_layout.reshape(-1)
41
+ sparsity_reverse_lut = (((torch.cumsum(sparsity_layout_flat, dim=-1) - 1) *
42
+ (sparsity_layout_flat == 1) -
43
+ (1 * (sparsity_layout_flat == 0)))
44
+ .reshape(sparsity_layout.size()).transpose(-1, -2).contiguous().reshape(-1))
45
+
46
+ n_sparse_blocks = torch.sum(sparsity_layout.to(torch.int)).item()
47
+
48
+ validate_contiguous(sparsity_layout_t, sparsity_lut, sparsity_reverse_lut)
49
+
50
+ return _BlocksparseTranspose.apply(x, sparsity_layout_t, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
51
+ n_sparse_blocks, triton_block_size), sparsity_layout_t
52
+
53
+
54
+ class _BlocksparseTranspose(torch.autograd.Function):
55
+
56
+ @staticmethod
57
+ def forward(ctx, x: Tensor,
58
+ sparsity_layout: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor, sparsity_block_size: int,
59
+ n_sparse_blocks: int, triton_block_size: int) -> (Tensor, Tensor):
60
+ output = torch.empty(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size), device=x.device)
61
+
62
+ x_b, x_r, x_c = x.size()
63
+ x_b_s, x_r_s, x_c_s = x.stride()
64
+ s_l_b, s_l_r, s_l_c = sparsity_layout.size()
65
+ s_l_b_s, s_l_r_s, s_l_c_s = sparsity_layout.stride()
66
+ s_lut_r, s_lut_c = sparsity_lut.shape
67
+ s_lut_r_s, s_lut_c_s = sparsity_lut.stride()
68
+ o_b, o_r, o_c = output.size()
69
+ o_b_s, o_r_s, o_c_s = output.stride()
70
+
71
+ if triton_block_size is None:
72
+ triton_block_size = get_triton_block_size(sparsity_block_size)
73
+
74
+ triton_grid = lambda meta: [o_b,
75
+ triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
76
+ triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
77
+
78
+ (_BlocksparseTranspose.kernel_blocksparse_transpose[triton_grid]
79
+ (x,
80
+ x_b, x_b_s, x_r_s, x_c_s,
81
+ s_l_b, s_l_b_s, s_l_r_s, s_l_c_s,
82
+ sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
83
+ sparsity_reverse_lut,
84
+ output,
85
+ o_b, o_b_s,
86
+ triton_block_size))
87
+
88
+ # Save for backward pass
89
+ ctx.save_for_backward(sparsity_layout)
90
+ ctx.sparsity_layout = sparsity_layout
91
+ ctx.sparsity_block_size = sparsity_block_size
92
+ ctx.triton_block_size = triton_block_size
93
+
94
+ return output
95
+
96
+ @staticmethod
97
+ def backward(ctx, grad_output):
98
+ sparsity_layout = ctx.saved_tensors[0]
99
+ sparsity_block_size = ctx.sparsity_block_size
100
+ triton_block_size = ctx.triton_block_size
101
+
102
+ return transpose(grad_output, sparsity_layout, sparsity_block_size, triton_block_size)[0], None, None, None, None, None, None
+
+     @staticmethod
+     @triton.jit
+     def kernel_blocksparse_transpose(x,
+                                      x_b, x_b_s, x_r_s, x_c_s,
+                                      s_l_b, s_l_b_s, s_l_r_s, s_l_c_s,
+                                      s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                      r_lut,
+                                      o,
+                                      o_b, o_b_s,
+                                      TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+         # Get triton block indices
+         pid_blk = tl.program_id(axis=0)
+         pid_row = tl.program_id(axis=1)
+         pid_col = tl.program_id(axis=2)
+
+         # Get sparsity index of current output block consisting of its batch, row, and column index
+         spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
+         spa_bat_msk = (spa_bat_idx < s_lut_r * s_lut_r_s)
+         spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+
+         spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
+         spa_row_msk = (spa_row_idx < s_lut_r * s_lut_r_s)
+         spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+
+         spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
+         spa_col_msk = (spa_col_idx < s_lut_r * s_lut_r_s)
+         spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+
+         # Get reverse sparsity indices
+         rev_idx_spa_idx = (spa_bat * s_l_b_s +
+                            spa_row * s_l_r_s +
+                            spa_col * s_l_c_s)
+         rev_idx_spa_msk = (rev_idx_spa_idx < s_l_b * s_l_b_s)
+         rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
+
+         if rev_idx_spa == -1:
+             assert False, "Invalid sparsity block"
+
+         blk_x_idx = (rev_idx_spa * x_b_s +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+         blk_x_msk = (blk_x_idx < x_b * x_b_s)
+         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+
+         blk_x_t = tl.trans(blk_x)
+
+         blk_o_idx = (pid_blk * o_b_s +
+                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+         blk_o_msk = (blk_o_idx < o_b * o_b_s)
+         tl.store(o + blk_o_idx, blk_x_t, mask=blk_o_msk)
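
The transpose builds its reverse LUT from the original layout and then applies the same ``transpose(-1, -2)`` permutation to it as to the layout itself, so that the k-th output block (enumerated in ``torch.nonzero(sparsity_layout_t)`` order) can look up which source block to read and transpose. A small demonstration of that permutation in plain PyTorch (illustrative only, not part of the package):

import torch

sparsity_layout = torch.tensor([[[1, 1],
                                 [0, 1]]])
flat = sparsity_layout.reshape(-1)                        # tensor([1, 1, 0, 1])
reverse_lut = ((torch.cumsum(flat, dim=-1) - 1) *
               (flat == 1) -
               (1 * (flat == 0)))                         # tensor([ 0,  1, -1,  2])
reverse_lut_t = (reverse_lut.reshape(sparsity_layout.size())
                 .transpose(-1, -2).contiguous().reshape(-1))
print(reverse_lut_t)                                      # tensor([ 0, -1,  1,  2])

In the transposed layout, position (0, 0) reads source block 0, position (0, 1) is absent, and positions (1, 0) and (1, 1) read source blocks 1 and 2; the kernel then stores each block with tl.trans applied.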
blksprs/utils/tools.py ADDED
@@ -0,0 +1,20 @@
+ import torch
+ from torch import Tensor, Size
+
+
+ def do_shape_blocksparse(x: Tensor):
+     if x.dim() == 3:
+         return x, x.size()
+
+     return x.reshape(-1, x.size(-2), x.size(-1)), x.size()
+
+
+ def undo_shape_blocksparse(x: Tensor, shape: Size):
+     if x.shape[-2:] == shape[-2:]:
+         return x
+
+     return x.reshape((*shape[:-2], *x.shape[-2:]))
+
+
+ def get_triton_block_size(sparsity_block_size: int, limit: int = 128):
+     return min(sparsity_block_size, limit)
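
A short usage sketch for the helpers above, following directly from their definitions: ``do_shape_blocksparse`` collapses all leading dimensions into a single batch dimension, ``undo_shape_blocksparse`` restores them, and ``get_triton_block_size`` caps the kernel block size at a default limit of 128:

import torch
from blksprs.utils.tools import do_shape_blocksparse, undo_shape_blocksparse, get_triton_block_size

x = torch.randn(2, 3, 64, 64)                      # e.g. [batch, heads, rows, cols]
x_flat, original_shape = do_shape_blocksparse(x)   # flattened to [6, 64, 64]
assert x_flat.dim() == 3

x_restored = undo_shape_blocksparse(x_flat, original_shape)
assert x_restored.shape == x.shape

assert get_triton_block_size(64) == 64             # below the limit: used as-is
assert get_triton_block_size(256) == 128           # capped at the default limit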