PyPI - blksprs - Versions diffs - 1.10.1__py3-none-any.whl → 1.11__py3-none-any.whl - Mend

blksprs-1.10.1-py3-none-any.whl → blksprs-1.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

blksprs/__init__.py +0 -1
blksprs/ops/conversion.py +42 -15
blksprs/ops/distribution.py +60 -30
blksprs/ops/flow.py +63 -31
blksprs/ops/matmul.py +40 -22
blksprs/ops/partitioning.py +102 -59
blksprs/ops/repeat.py +88 -76
blksprs/ops/softmax.py +71 -63
blksprs/ops/transpose.py +38 -101
blksprs/utils/tools.py +7 -1
{blksprs-1.10.1.dist-info → blksprs-1.11.dist-info}/METADATA +2 -2
blksprs-1.11.dist-info/RECORD +23 -0
{blksprs-1.10.1.dist-info → blksprs-1.11.dist-info}/WHEEL +1 -1
blksprs/ops/misc/exp.py +0 -104
blksprs-1.10.1.dist-info/RECORD +0 -24
{blksprs-1.10.1.dist-info → blksprs-1.11.dist-info}/top_level.txt +0 -0

blksprs/ops/softmax.py CHANGED Viewed

@@ -3,7 +3,6 @@ import triton
 from torch import Tensor
 from triton import language as tl
-from blksprs.ops.misc.exp import exp
 from blksprs.ops.misc.row_wise import row_wise_sum, row_wise_max, row_wise_sub
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import get_triton_block_size, stride
@@ -12,7 +11,7 @@ from blksprs.utils.validation import validate_contiguous, validate_dimensions, v
 def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
-            triton_block_size: int = None) -> BlksprsTensor:
+            triton_block_size: int = None, lut: dict = None) -> BlksprsTensor:
     """Computes the softmax of a block-sparse tensor in compressed form.
     Note:
@@ -23,6 +22,7 @@ def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
         sparsity_layout (Tensor): The sparsity layout of the block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.
         triton_block_size (int): The block size to use for the triton kernel (default ``None``).
+        lut (dict, optional): A dictionary containing the look-up tables for the operation (default ``None``).
     Returns:
         BlksprsTensor: The result of the softmax operation as a block-sparse tensor in compressed form.
@@ -37,24 +37,38 @@ def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
     validate_sparsity_block_size(sparsity_block_size, x)
     validate_triton_block_size(triton_block_size, sparsity_block_size)
-    sparsity_lut = torch.nonzero(sparsity_layout).contiguous()
-    sparsity_layout_rws, _ = torch.max(sparsity_layout, dim=-1, keepdim=True)
-    sparsity_layout_rws_flat = sparsity_layout_rws.reshape(-1)
-    sparsity_reverse_lut_rws = ((torch.cumsum(sparsity_layout_rws_flat, dim=-1) - 1) *
-                                (sparsity_layout_rws_flat == 1) -
-                                (1 * (sparsity_layout_rws_flat == 0)))
-    validate_contiguous(sparsity_layout, sparsity_lut, sparsity_reverse_lut_rws)
+    lut = _BlocksparseSoftmax.build_lut(lut, sparsity_layout)
     return BlksprsTensor(_BlocksparseSoftmax.apply(x, sparsity_layout,
-                                                   sparsity_lut,
-                                                   sparsity_reverse_lut_rws,
+                                                   lut["sparsity_lut"],
+                                                   lut["sparsity_reverse_lut_rws"],
                                                    sparsity_block_size, triton_block_size))
 class _BlocksparseSoftmax(torch.autograd.Function):
+    @staticmethod
+    def build_lut(lut: dict, sparsity_layout: Tensor):
+        if lut is None:
+            lut = dict()
+        if "sparsity_lut" not in lut:
+            sparsity_lut = torch.nonzero(sparsity_layout).contiguous()
+            lut["sparsity_lut"] = sparsity_lut
+        if "sparsity_reverse_lut_rws" not in lut:
+            sparsity_layout_rws, _ = torch.max(sparsity_layout, dim=-1, keepdim=True)
+            sparsity_layout_rws_flat = sparsity_layout_rws.reshape(-1)
+            sparsity_reverse_lut_rws = ((torch.cumsum(sparsity_layout_rws_flat, dim=-1) - 1) *
+                                        (sparsity_layout_rws_flat == 1) -
+                                        (1 * (sparsity_layout_rws_flat == 0)))
+            lut["sparsity_reverse_lut_rws"] = sparsity_reverse_lut_rws
+        validate_contiguous(sparsity_layout, lut["sparsity_lut"], lut["sparsity_reverse_lut_rws"])
+        return lut
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout: Tensor,
                 sparsity_lut: Tensor,
@@ -72,7 +86,7 @@ class _BlocksparseSoftmax(torch.autograd.Function):
                                                            flag_slice_only=True,
                                                            triton_block_size=triton_block_size)
         x_scaled = row_wise_sub(x, sparsity_layout, x_row_wise_max, sparsity_block_size, triton_block_size)
-        x_exp = exp(x_scaled, sparsity_block_size, triton_block_size=triton_block_size)
+        x_exp = torch.exp(x_scaled)
         x_exp_row_wise_sum, sparsity_layout_rws = row_wise_sum(x_exp, sparsity_layout, sparsity_block_size,
                                                                flag_slice_only=True,
                                                                triton_block_size=triton_block_size)
@@ -182,29 +196,26 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0 and rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
-        if rev_idx_spa_s == -1:
-            tl.device_assert(False)
-            return
-        # Load x block
-        blk_x_idx = ((pid_blk * x_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
-        blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+        if rev_idx_spa_s >= 0:
+            # Load x block
+            blk_x_idx = ((pid_blk * x_b_s) +
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                         ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+            blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
+            blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
-        # Load sum block
-        blk_s_idx = (rev_idx_spa_s * s_b_s +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
-                     (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < s_b * s_b_s)
-        blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
+            # Load sum block
+            blk_s_idx = (rev_idx_spa_s * s_b_s +
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
+                         (tl.arange(0, 1) * s_c_s)[None, :])
+            blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < s_b * s_b_s)
+            blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
-        # Compute softmax
-        buf = tl.div_rn(blk_x, blk_s)
+            # Compute softmax
+            buf = tl.div_rn(blk_x, blk_s)
-        # Store output
-        tl.store(o + blk_x_idx, buf, mask=blk_x_msk)
+            # Store output
+            tl.store(o + blk_x_idx, buf, mask=blk_x_msk)
     @staticmethod
     @triton.jit
@@ -239,32 +250,29 @@ class _BlocksparseSoftmax(torch.autograd.Function):
         rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0 and rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s)
         rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
-        if rev_idx_spa_s == -1:
-            tl.device_assert(False)
-            return
-        blk_s_idx = (rev_idx_spa_s * s_b_s +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
-                     (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < s_b * s_b_s)
-        blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
-        blk_g_idx = ((pid_blk * g_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_r_s)[:, None] +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_c_s)[None, :])
-        blk_g_msk = (blk_g_idx >= 0 and blk_g_idx < g_b * g_b_s)
-        blk_g = tl.load(g + blk_g_idx, mask=blk_g_msk)
-        blk_x_idx = ((pid_blk * x_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
-        blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
-        buf = blk_x * (blk_g - blk_s)
-        blk_o_idx = ((pid_blk * o_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
-        tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
+        if rev_idx_spa_s >= 0:
+            blk_s_idx = (rev_idx_spa_s * s_b_s +
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
+                         (tl.arange(0, 1) * s_c_s)[None, :])
+            blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < s_b * s_b_s)
+            blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
+            blk_g_idx = ((pid_blk * g_b_s) +
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_r_s)[:, None] +
+                         ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_c_s)[None, :])
+            blk_g_msk = (blk_g_idx >= 0 and blk_g_idx < g_b * g_b_s)
+            blk_g = tl.load(g + blk_g_idx, mask=blk_g_msk)
+            blk_x_idx = ((pid_blk * x_b_s) +
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                         ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+            blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
+            blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
+            buf = blk_x * (blk_g - blk_s)
+            blk_o_idx = ((pid_blk * o_b_s) +
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                         ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+            blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
+            tl.store(o + blk_o_idx, buf, mask=blk_o_msk)

blksprs/ops/transpose.py CHANGED Viewed

@@ -3,14 +3,15 @@ import triton
 from torch import Tensor
 from triton import language as tl
+from blksprs.ops.flow import flow_forward_pull
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import get_triton_block_size, stride
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_triton_block_size
-def transpose(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int, triton_block_size: int = None) -> (
-        BlksprsTensor, Tensor):
+def transpose(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int, triton_block_size: int = None,
+              lut: dict = None) -> (BlksprsTensor, Tensor):
     """Transposes a block-sparse tensor in compressed form.
     Note:
@@ -21,6 +22,7 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: in
         sparsity_layout (Tensor): The sparsity layout of the block-sparse tensor.
         sparsity_block_size (int): The size of the sparsity blocks.
         triton_block_size (int): The block size to use for the triton kernel (default ``None``).
+        lut (dict, optional): A dictionary containing the look-up tables for the operation (default ``None``).
     Returns:
         BlksprsTensor: The transposed block-sparse tensor in compressed form.
@@ -28,6 +30,7 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: in
     """
     x = x.contiguous()
+    x_t = x.transpose(-1, -2).contiguous()
     validate_dimensions(x)
     validate_contiguous(x)
@@ -36,66 +39,53 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: in
     validate_sparsity_block_size(sparsity_block_size, x)
     validate_triton_block_size(triton_block_size, sparsity_block_size)
-    sparsity_layout_t = sparsity_layout.transpose(-1, -2).contiguous()
+    lut = _BlocksparseTranspose.build_lut(lut, sparsity_layout)
-    sparsity_lut = torch.nonzero(sparsity_layout_t).contiguous()
+    return BlksprsTensor(
+        _BlocksparseTranspose.apply(x_t, lut["sparsity_layout_t"], lut["sparsity_lut"], lut["sparsity_reverse_lut"],
+                                    sparsity_block_size,
+                                    lut["n_sparse_blocks"], triton_block_size)), lut["sparsity_layout_t"]
-    sparsity_layout_flat = sparsity_layout.reshape(-1)
-    sparsity_reverse_lut = (((torch.cumsum(sparsity_layout_flat, dim=-1) - 1) *
-                             (sparsity_layout_flat == 1) -
-                             (1 * (sparsity_layout_flat == 0)))
-                            .reshape(sparsity_layout.size()).transpose(-1, -2).contiguous().reshape(-1))
-    n_sparse_blocks = torch.sum(sparsity_layout.to(torch.int)).item()
+class _BlocksparseTranspose(torch.autograd.Function):
-    validate_contiguous(sparsity_layout_t, sparsity_lut, sparsity_reverse_lut)
+    @staticmethod
+    def build_lut(lut: dict, sparsity_layout: Tensor):
+        if lut is None:
+            lut = dict()
-    return BlksprsTensor(
-        _BlocksparseTranspose.apply(x, sparsity_layout_t, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
-                                    n_sparse_blocks, triton_block_size)), sparsity_layout_t
+        if "sparsity_layout_t" not in lut:
+            sparsity_layout_t = sparsity_layout.transpose(-1, -2).contiguous()
+            lut["sparsity_layout_t"] = sparsity_layout_t
+        if "sparsity_lut" not in lut:
+            sparsity_lut = torch.nonzero(lut["sparsity_layout_t"]).contiguous()
+            lut["sparsity_lut"] = sparsity_lut
-class _BlocksparseTranspose(torch.autograd.Function):
+        if "sparsity_reverse_lut" not in lut:
+            sparsity_layout_flat = sparsity_layout.reshape(-1)
+            sparsity_reverse_lut = (((torch.cumsum(sparsity_layout_flat, dim=-1) - 1) *
+                                     (sparsity_layout_flat == 1) -
+                                     (1 * (sparsity_layout_flat == 0)))
+                                    .reshape(sparsity_layout.size()).transpose(-1, -2).contiguous().reshape(-1))
+            lut["sparsity_reverse_lut"] = sparsity_reverse_lut
+        if "n_sparse_blocks" not in lut:
+            n_sparse_blocks = torch.sum(sparsity_layout.to(torch.int)).item()
+            lut["n_sparse_blocks"] = n_sparse_blocks
+        validate_contiguous(lut["sparsity_layout_t"], lut["sparsity_lut"], lut["sparsity_reverse_lut"])
+        return lut
     @staticmethod
     def forward(ctx, x: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
                 sparsity_block_size: int,
                 n_sparse_blocks: int, triton_block_size: int) -> Tensor:
-        output = torch.empty(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
-                             dtype=x.dtype, device=x.device)
-        x_b, x_r, x_c = x.size()
-        x_b_s, x_r_s, x_c_s = stride(x)
-        s_l_b, s_l_r, s_l_c = sparsity_layout_o.size()
-        s_l_b_s, s_l_r_s, s_l_c_s = stride(sparsity_layout_o)
-        s_lut_r, s_lut_c = sparsity_lut.shape
-        s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
-        o_b, o_r, o_c = output.size()
-        o_b_s, o_r_s, o_c_s = stride(output)
-        if triton_block_size is None:
-            triton_block_size = get_triton_block_size(sparsity_block_size)
-        triton_grid = lambda meta: [o_b,
-                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-        (_BlocksparseTranspose.kernel_blocksparse_transpose[triton_grid]
-         (x,
-          x_b, x_b_s, x_r_s, x_c_s,
-          s_l_b, s_l_b_s, s_l_r_s, s_l_c_s,
-          sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-          sparsity_reverse_lut,
-          output,
-          o_b, o_b_s,
-          triton_block_size))
-        # Save for backward pass
         ctx.save_for_backward(sparsity_layout_o)
-        ctx.sparsity_block_size = sparsity_block_size
-        ctx.triton_block_size = triton_block_size
-        return output
+        return flow_forward_pull(ctx, x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut,
+                                 sparsity_block_size, n_sparse_blocks, triton_block_size)
     @staticmethod
     def backward(ctx, grad_output):
@@ -105,56 +95,3 @@ class _BlocksparseTranspose(torch.autograd.Function):
         return transpose(grad_output, sparsity_layout, sparsity_block_size, triton_block_size)[
             0], None, None, None, None, None, None
-    @staticmethod
-    @triton.jit
-    def kernel_blocksparse_transpose(x,
-                                     x_b, x_b_s, x_r_s, x_c_s,
-                                     s_l_b, s_l_b_s, s_l_r_s, s_l_c_s,
-                                     s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-                                     r_lut,
-                                     o,
-                                     o_b, o_b_s,
-                                     TRITON_BLOCK_SIZE: tl.constexpr) -> None:
-        # Get triton block indices
-        pid_blk = tl.program_id(axis=0)
-        pid_row = tl.program_id(axis=1)
-        pid_col = tl.program_id(axis=2)
-        # Get sparsity index of current output block consisting of its batch, row, and column index
-        spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-        spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-        spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
-        spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-        spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-        spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
-        spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-        spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
-        spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
-        # Get reverse sparsity index
-        rev_idx_spa_idx = (spa_bat * s_l_b_s +
-                           spa_row * s_l_r_s +
-                           spa_col * s_l_c_s)
-        rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_b * s_l_b_s)
-        rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
-        if rev_idx_spa == -1:
-            tl.device_assert(False)
-            return
-        blk_x_idx = (rev_idx_spa * x_b_s +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
-        blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
-        blk_x_t = tl.trans(blk_x)
-        blk_o_idx = (pid_blk * o_b_s +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
-        tl.store(o + blk_o_idx, blk_x_t, mask=blk_o_msk)

blksprs/utils/tools.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import torch
 from torch import Tensor, Size
@@ -20,4 +21,9 @@ def get_triton_block_size(sparsity_block_size: int, limit: int = 128):
 def stride(x: Tensor):
-    return x.view(x.shape).stride()
+    if x.dim() == 2:
+        return x.size(1), 1
+    elif x.dim() == 3:
+        return x.size(1) * x.size(2), x.size(2), 1
+    else:
+        raise NotImplementedError

{blksprs-1.10.1.dist-info → blksprs-1.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: blksprs
-Version: 1.10.1
+Version: 1.11
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs

blksprs-1.11.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,23 @@
+blksprs/__init__.py,sha256=AJYVfR40nOfE5F3waHPVSuajwYDcoGkiEQc8HhQbUBU,1721
+blksprs/layouting/distribution_layout.py,sha256=xDGY5-J7uSD8oenlf8bEJ2amMiQG3NBf2klTTydbTJE,5140
+blksprs/layouting/sparsity_layout.py,sha256=IVtHc_nN3ZM2y4GFcys70PqDWmWc7tkHlVGlToErANk,9894
+blksprs/ops/conversion.py,sha256=QFtZ-nmY2JAWutheiO07vatXqz3eSZBP5Ym_U2Q1oWk,23299
+blksprs/ops/distribution.py,sha256=nHTuE7Tq0Q404VN8bWNC2sEwmmdAtgZI6I7auRICdps,21749
+blksprs/ops/flow.py,sha256=7tOXfTBKOAixYmDa_VXg7TwviLV5ZQMHQjtbyOjqA00,7879
+blksprs/ops/matmul.py,sha256=eVj_BGj78bJkXYuvw4KctMfcfveQBt5OdYmeXzdpO88,12631
+blksprs/ops/partitioning.py,sha256=qMv9w3yFWXwXIhIppdcJ_JMsoZ25HCH38vb6GRneoLM,10416
+blksprs/ops/repeat.py,sha256=i824ijprfYpCaEjiSG5FTUZz7wMS5ksVy_-vY7ZX8Fg,9729
+blksprs/ops/softmax.py,sha256=_mGkA2jHN8cXwtWXYswobEPyM7UC0JyzRszoE4ZYs7w,13063
+blksprs/ops/transpose.py,sha256=O1XhGIGiVkhOSKcBD0HrYaeK6HmpvEEzLb7zJl7xsIM,4246
+blksprs/ops/misc/broadcast_ops.py,sha256=pv0nssSDOdDbQFttpqUIs2ZXShqfm2RYCfJH-C5x3H0,5544
+blksprs/ops/misc/row_wise.py,sha256=DnV5-xEJUbqZlK2fETwHiPQDUMwT-lkc0VUhBlnJ5Y0,17458
+blksprs/utils/benchmarking.py,sha256=4pLVlnPW_2EM-NT3n4SClaRznVYEljztLbJcccz8kZE,1360
+blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
+blksprs/utils/layout_utils.py,sha256=49ZdPS_gMn_IrWty3FARbi2rda5a8g5DmAEL8LOrC30,670
+blksprs/utils/processing.py,sha256=WLuMJQ8v-YovXwcDjhlDn3N31WMZXrtyeeyKSgq_zn4,3642
+blksprs/utils/tools.py,sha256=k2OfEplbQiAwVjP84zZf7SNB8FzvMtOFBL9sC98OCbI,683
+blksprs/utils/validation.py,sha256=CbxBbeQWJo8wox5eMoVzaTlP9FVBwt3-gxUOmi3EUgw,4213
+blksprs-1.11.dist-info/METADATA,sha256=NUEiHexWiFNbMxQI2TUEzMw9iGBhxqflhWr2xCgOw28,9105
+blksprs-1.11.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+blksprs-1.11.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-1.11.dist-info/RECORD,,

{blksprs-1.10.1.dist-info → blksprs-1.11.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

blksprs/ops/misc/exp.py DELETED Viewed

@@ -1,104 +0,0 @@
-import torch
-import triton
-from torch import Tensor
-from triton import language as tl
-from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import get_triton_block_size, stride
-from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity_block_size, validate_triton_block_size
-def exp(x: BlksprsTensor, sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
-    """Applies the element-wise exponential function to a block-sparse tensor.
-    Note:
-        This operation does not consider sparse blocks, i.e., these will not be set to ``e^0``.
-        Consider this when converting back to tensors in regular form.
-    Args:
-        x (BlksprsTensor): A block-sparse tensor in compressed form.
-        sparsity_block_size (int): The size of the sparsity blocks.
-        triton_block_size (int): The block size to use for the triton kernel (default ``None``).
-    Returns:
-        BlksprsTensor: The exponential function applied to all elements of the input tensor as a block-sparse tensor in
-            compressed form.
-    """
-    x = x.contiguous()
-    validate_dimensions(x)
-    validate_contiguous(x)
-    validate_device(x)
-    validate_sparsity_block_size(sparsity_block_size, x)
-    validate_triton_block_size(triton_block_size, sparsity_block_size)
-    return BlksprsTensor(_BlocksparseExp.apply(x, sparsity_block_size, triton_block_size))
-class _BlocksparseExp(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, x: Tensor, sparsity_block_size: int, triton_block_size: int) -> Tensor:
-        output = torch.empty_like(x)
-        x_b, x_r, x_c = x.shape
-        x_b_s, x_r_s, x_c_s = stride(x)
-        o_b, o_r, o_c = output.shape
-        o_b_s, o_r_s, o_c_s = stride(output)
-        if triton_block_size is None:
-            triton_block_size = get_triton_block_size(sparsity_block_size)
-        triton_grid = lambda meta: [o_b,
-                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-        (_BlocksparseExp.kernel_blocksparse_exp[triton_grid]
-         (x,
-          x_b, x_b_s, x_r_s, x_c_s,
-          output,
-          o_b, o_b_s, o_r_s, o_c_s,
-          triton_block_size))
-        ctx.save_for_backward(output)
-        return output
-    @staticmethod
-    def backward(ctx, grad_output):
-        o = ctx.saved_tensors[0]
-        grad_x = torch.mul(grad_output, o)
-        return grad_x, None, None
-    @staticmethod
-    @triton.jit
-    def kernel_blocksparse_exp(x,
-                               x_b, x_b_s, x_r_s, x_c_s,
-                               o,
-                               o_b, o_b_s, o_r_s, o_c_s,
-                               TRITON_BLOCK_SIZE: tl.constexpr) -> None:
-        # Get triton block indices
-        pid_blk = tl.program_id(axis=0)
-        pid_row = tl.program_id(axis=1)
-        pid_col = tl.program_id(axis=2)
-        # Load block
-        blk_x_idx = ((pid_blk * x_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
-        blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
-        # Compute exp
-        buf = tl.exp(blk_x)
-        # Store block
-        blk_o_idx = ((pid_blk * o_b_s) +
-                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
-        tl.store(o + blk_o_idx, buf, mask=blk_o_msk)

blksprs-1.10.1.dist-info/RECORD DELETED Viewed

@@ -1,24 +0,0 @@
-blksprs/__init__.py,sha256=wnpk-20jXq7xV0xa-WpHfPQuauI2gEZz9sH-0blKxP0,1766
-blksprs/layouting/distribution_layout.py,sha256=xDGY5-J7uSD8oenlf8bEJ2amMiQG3NBf2klTTydbTJE,5140
-blksprs/layouting/sparsity_layout.py,sha256=IVtHc_nN3ZM2y4GFcys70PqDWmWc7tkHlVGlToErANk,9894
-blksprs/ops/conversion.py,sha256=NK5uXMepPJ9yYh0vnxKwx5_Ffj_bAvhqPVogf_7PY0g,22248
-blksprs/ops/distribution.py,sha256=qK5t5XgQSJxXPced8RohprqCtUMMTaEP2pFm3KU1c8o,20267
-blksprs/ops/flow.py,sha256=SWHDQ5zx0cjnPR0CcAcRNZdSusSAHSU840SwDNUr24g,6437
-blksprs/ops/matmul.py,sha256=LAQyPNwWVmBMRnAex3msLSPD_aG5SblLCMiutJWqvus,11632
-blksprs/ops/partitioning.py,sha256=ugKnpvH36ND7qeJQp56M74qqfACkzcTVuXebzw__28Y,8286
-blksprs/ops/repeat.py,sha256=RCa-dITomA5v12K5Oxa5_ReA361zS7WHPNNHxSp9PGw,8578
-blksprs/ops/softmax.py,sha256=i8NJhvPRYya94AzpN6qiki6_G9KfDrtPifhWd7wbYzk,12496
-blksprs/ops/transpose.py,sha256=oAtUu7QzQnNAH3lvRs_MIvIKpBu9h74f9Sk07AxKnDM,6991
-blksprs/ops/misc/broadcast_ops.py,sha256=pv0nssSDOdDbQFttpqUIs2ZXShqfm2RYCfJH-C5x3H0,5544
-blksprs/ops/misc/exp.py,sha256=ygfw7oD6ALdPwNQX_HelKgO8I3-LCgIXH_x0gWzkUN8,3840
-blksprs/ops/misc/row_wise.py,sha256=DnV5-xEJUbqZlK2fETwHiPQDUMwT-lkc0VUhBlnJ5Y0,17458
-blksprs/utils/benchmarking.py,sha256=4pLVlnPW_2EM-NT3n4SClaRznVYEljztLbJcccz8kZE,1360
-blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
-blksprs/utils/layout_utils.py,sha256=49ZdPS_gMn_IrWty3FARbi2rda5a8g5DmAEL8LOrC30,670
-blksprs/utils/processing.py,sha256=WLuMJQ8v-YovXwcDjhlDn3N31WMZXrtyeeyKSgq_zn4,3642
-blksprs/utils/tools.py,sha256=r7Y4C37vfSWUyQTGwa8NyRqgovmsq9hMufkenqYHOxo,539
-blksprs/utils/validation.py,sha256=CbxBbeQWJo8wox5eMoVzaTlP9FVBwt3-gxUOmi3EUgw,4213
-blksprs-1.10.1.dist-info/METADATA,sha256=5in6lYCZo1bd8urYR0wkTxIiTTAIAANukLpKeZfGasY,9107
-blksprs-1.10.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-blksprs-1.10.1.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
-blksprs-1.10.1.dist-info/RECORD,,

{blksprs-1.10.1.dist-info → blksprs-1.11.dist-info}/top_level.txt RENAMED Viewed

File without changes

blksprs-1.10.1-py3-none-any.whl → blksprs-1.11-py3-none-any.whl

blksprs-1.10.1-py3-none-any.whl → blksprs-1.11-py3-none-any.whl