PyPI - blksprs - Versions diffs - 1.11__py3-none-any.whl → 2.0__py3-none-any.whl - Mend

blksprs 1.11__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

blksprs/__init__.py +4 -5
blksprs/layouting/distribution_layout.py +64 -48
blksprs/layouting/sparsity_layout.py +96 -72
blksprs/ops/conversion.py +349 -338
blksprs/ops/distribution.py +318 -294
blksprs/ops/flow.py +113 -100
blksprs/ops/matmul.py +187 -172
blksprs/ops/misc/broadcast_ops.py +68 -53
blksprs/ops/misc/row_wise.py +223 -176
blksprs/ops/partitioning.py +140 -132
blksprs/ops/repeat.py +118 -120
blksprs/ops/softmax.py +240 -214
blksprs/ops/transpose.py +55 -52
blksprs/utils/autotuning.py +78 -0
blksprs/utils/benchmarking.py +3 -3
blksprs/utils/processing.py +2 -1
blksprs/utils/tools.py +5 -6
blksprs/utils/validation.py +22 -16
{blksprs-1.11.dist-info → blksprs-2.0.dist-info}/METADATA +55 -36
blksprs-2.0.dist-info/RECORD +23 -0
{blksprs-1.11.dist-info → blksprs-2.0.dist-info}/WHEEL +1 -1
blksprs/utils/layout_utils.py +0 -17
blksprs-1.11.dist-info/RECORD +0 -23
{blksprs-1.11.dist-info → blksprs-2.0.dist-info}/top_level.txt +0 -0

blksprs/ops/misc/row_wise.py CHANGED Viewed

@@ -1,16 +1,19 @@
 import torch
 import triton
 from torch import Tensor
+from torch._library.triton import wrap_triton, triton_op
 from triton import language as tl
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import get_triton_block_size, stride
+from blksprs.utils.tools import stride
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, validate_sparsity, \
-    validate_sparsity_block_size, validate_triton_block_size
+    validate_sparsity_block_size
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
-                 flag_slice_only: bool = False, triton_block_size: int = None) -> (BlksprsTensor, Tensor):
+                 flag_slice_only: bool = False) -> (BlksprsTensor, Tensor):
     """Computes the row-wise sum of a block-sparse tensor.
     Returns a block-sparse tensor in compressed form with only one block per row, where the first entry contains the sum
@@ -25,7 +28,6 @@ def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
         sparsity_block_size (int): The size of the sparsity blocks.
         flag_slice_only (bool, optional): If set the output will be of shape ``[x.size(0), x.size(1), 1]``
             (default ``False``).
-        triton_block_size (int): The block size to use for the triton kernel (default ``None``).
     Returns:
         tuple[BlksprsTensor, Tensor]: A tuple containing a block-sparse tensor in compressed form containing the row-wise sum
@@ -39,7 +41,6 @@ def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     validate_device(x)
     validate_sparsity(sparsity_block_size, (x, sparsity_layout))
     validate_sparsity_block_size(sparsity_block_size, x)
-    validate_triton_block_size(triton_block_size, sparsity_block_size)
     sparsity_lut = torch.nonzero(sparsity_layout).contiguous()
@@ -54,50 +55,65 @@ def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     validate_contiguous(sparsity_layout, sparsity_lut,
                         sparsity_layout_output, sparsity_reverse_lut_output)
-    output = torch.zeros(size=(n_sparse_blocks_output,
-                               sparsity_block_size,
-                               1 if flag_slice_only else sparsity_block_size),
-                         dtype=x.dtype,
-                         device=x.device)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    s_lut_x_r, s_lut_x_c = sparsity_lut.size()
-    s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
-    s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-    triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-    (kernel_blocksparse_row_wise_sum[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-      output,
-      o_b, o_b_s, o_r_s,
-      s_l_o_b, s_l_o_b_s, s_l_o_r_s,
-      sparsity_reverse_lut_output,
-      triton_block_size))
-    return BlksprsTensor(output), sparsity_layout_output
+    return BlksprsTensor(row_wise_sum_forward(
+        x, sparsity_lut, sparsity_layout_output, sparsity_reverse_lut_output,
+        sparsity_block_size, n_sparse_blocks_output, flag_slice_only)), sparsity_layout_output
+@triton_op("blksprs::row_wise_sum_forward", mutates_args={})
+def row_wise_sum_forward(x: Tensor, sparsity_lut: Tensor,
+                         sparsity_layout_output: Tensor, sparsity_reverse_lut_output: Tensor,
+                         sparsity_block_size: int, n_sparse_blocks_output: int,
+                         flag_slice_only: bool = False) -> Tensor:
+    with torch.no_grad():
+        output = torch.zeros(
+            size=(n_sparse_blocks_output, sparsity_block_size, 1 if flag_slice_only else sparsity_block_size),
+            dtype=x.dtype, device=x.device)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        s_lut_x_r, s_lut_x_c = sparsity_lut.size()
+        s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
+        s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
+        triton_grid = lambda meta: [x_b,
+                                    triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(row_wise_sum_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+          output,
+          o_b, o_b_s, o_r_s,
+          s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+          sparsity_reverse_lut_output,
+          sparsity_block_size))
+        return output
+# noinspection PyUnusedLocal
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
+    reset_to_zero=["o"]
+)
 @triton.jit
-def kernel_blocksparse_row_wise_sum(x,
-                                    x_b, x_b_s, x_r_s, x_c_s,
-                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-                                    o,
-                                    o_b, o_b_s, o_r_s,
-                                    s_l_o_b, s_l_o_b_s, s_l_o_r_s,
-                                    r_lut_o,
-                                    TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+def row_wise_sum_kernel(x,
+                        x_b, x_b_s, x_r_s, x_c_s,
+                        s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                        o,
+                        o_b, o_b_s, o_r_s,
+                        s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+                        r_lut_o,
+                        sparsity_block_size,
+                        TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+    # Get triton block indices
     pid_blk = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
@@ -117,27 +133,27 @@ def kernel_blocksparse_row_wise_sum(x,
     rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
-    if rev_idx_spa == -1:
-        tl.device_assert(False)
-        return
-    blk_idx = ((pid_blk * x_b_s) +
-               ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-               ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_msk = (blk_idx >= 0 and blk_idx < x_b * x_b_s)
-    blk = tl.load(x + blk_idx, mask=blk_msk)
+    if rev_idx_spa >= 0:
+        blk_idx = ((pid_blk * x_b_s) +
+                   ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                   ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+        blk_msk = (blk_idx >= 0 and
+                   blk_idx < x_b * x_b_s)
+        blk = tl.load(x + blk_idx, mask=blk_msk)
-    buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
+        buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
-    o_idx = (rev_idx_spa * o_b_s +
-             ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-             (tl.arange(0, 1))[None, :])
-    o_msk = (o_idx >= 0 and o_idx < o_b * o_b_s)
-    tl.atomic_add(o + o_idx, buf, o_msk)
+        o_idx = (rev_idx_spa * o_b_s +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 (tl.arange(0, 1))[None, :])
+        o_msk = (o_idx >= 0 and
+                 o_idx < o_b * o_b_s)
+        tl.atomic_add(o + o_idx, buf, o_msk)
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
-                 flag_slice_only: bool = False, triton_block_size: int = None) -> (BlksprsTensor, Tensor):
+                 flag_slice_only: bool = False) -> (BlksprsTensor, Tensor):
     """Computes the row-wise max of a block-sparse tensor.
     Returns a block-sparse tensor in compressed form with only one block per row, where the first entry contains the
@@ -152,13 +168,14 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
         sparsity_block_size (int): The size of the sparsity blocks.
         flag_slice_only (bool, optional): If set the output will be of shape ``[x.size(0), x.size(1), 1]``
             (default ``False``).
-        triton_block_size (int): The block size to use for the triton kernel (default ``None``).
     Returns:
         tuple[BlksprsTensor, Tensor]: A tuple containing a block-sparse tensor in compressed form containing the row-wise max
             of the input and the sparsity layout of the output tensor.
     """
+    # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376, should be fixed with the upcoming 3.4.0 release
+    x = torch.where(x == -0.0, torch.tensor(0.0), x)
     x = x.contiguous()
     validate_dimensions(x)
@@ -166,7 +183,6 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     validate_device(x)
     validate_sparsity(sparsity_block_size, (x, sparsity_layout))
     validate_sparsity_block_size(sparsity_block_size, x)
-    validate_triton_block_size(triton_block_size, sparsity_block_size)
     sparsity_lut = torch.nonzero(sparsity_layout).contiguous()
@@ -181,50 +197,67 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     validate_contiguous(sparsity_layout, sparsity_lut,
                         sparsity_layout_output, sparsity_reverse_lut_output)
-    output = torch.full(size=(n_sparse_blocks_output,
-                              sparsity_block_size,
-                              1 if flag_slice_only else sparsity_block_size),
-                        fill_value=float("-inf"),
-                        device=x.device)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    s_lut_x_r, s_lut_x_c = sparsity_lut.size()
-    s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
-    s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-    triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-    (kernel_blocksparse_row_wise_max[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-      output,
-      o_b, o_b_s, o_r_s,
-      s_l_o_b, s_l_o_b_s, s_l_o_r_s,
-      sparsity_reverse_lut_output,
-      triton_block_size))
-    return BlksprsTensor(output), sparsity_layout_output
+    return BlksprsTensor(
+        row_wise_max_forward(x, sparsity_lut, sparsity_layout_output, sparsity_reverse_lut_output, sparsity_block_size,
+                             n_sparse_blocks_output, flag_slice_only)), sparsity_layout_output
+@triton_op("blksprs::row_wise_max_forward", mutates_args={})
+def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
+                         sparsity_layout_output: Tensor, sparsity_reverse_lut_output: Tensor,
+                         sparsity_block_size: int, n_sparse_blocks_output: int,
+                         flag_slice_only: bool = False) -> Tensor:
+    with torch.no_grad():
+        output = torch.full(size=(n_sparse_blocks_output,
+                                  sparsity_block_size,
+                                  1 if flag_slice_only else sparsity_block_size),
+                            fill_value=torch.finfo(x.dtype).min,
+                            device=x.device)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        s_lut_x_r, s_lut_x_c = sparsity_lut.size()
+        s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
+        s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
+        triton_grid = lambda meta: [x_b,
+                                    triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(row_wise_max_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+          output,
+          o_b, o_b_s, o_r_s,
+          s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+          sparsity_reverse_lut_output,
+          sparsity_block_size))
+        return output
+# noinspection PyUnusedLocal
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
+    restore_value=["o"]
+)
 @triton.jit
-def kernel_blocksparse_row_wise_max(x,
-                                    x_b, x_b_s, x_r_s, x_c_s,
-                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-                                    o,
-                                    o_b, o_b_s, o_r_s,
-                                    s_l_o_b, s_l_o_b_s, s_l_o_r_s,
-                                    r_lut_o,
-                                    TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+def row_wise_max_kernel(x,
+                        x_b, x_b_s, x_r_s, x_c_s,
+                        s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                        o,
+                        o_b, o_b_s, o_r_s,
+                        s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+                        r_lut_o,
+                        sparsity_block_size,
+                        TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+    # Get triton block indices
     pid_blk = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
@@ -244,27 +277,27 @@ def kernel_blocksparse_row_wise_max(x,
     rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
-    if rev_idx_spa == -1:
-        tl.device_assert(False)
-        return
-    blk_idx = ((pid_blk * x_b_s) +
-               ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-               ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_msk = (blk_idx >= 0 and blk_idx < x_b * x_b_s)
-    blk = tl.load(x + blk_idx, mask=blk_msk)
+    if rev_idx_spa >= 0:
+        blk_idx = ((pid_blk * x_b_s) +
+                   ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                   ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+        blk_msk = (blk_idx >= 0 and
+                   blk_idx < x_b * x_b_s)
+        blk = tl.load(x + blk_idx, mask=blk_msk)
-    buf = tl.reshape(tl.max(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
+        buf = tl.reshape(tl.max(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
-    o_idx = (rev_idx_spa * o_b_s +
-             ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-             (tl.arange(0, 1))[None, :])
-    o_msk = (o_idx >= 0 and o_idx < o_b * o_b_s)
-    tl.atomic_max(o + o_idx, buf, o_msk)
+        o_idx = (rev_idx_spa * o_b_s +
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 (tl.arange(0, 1))[None, :])
+        o_msk = (o_idx >= 0 and
+                 o_idx < o_b * o_b_s)
+        tl.atomic_max(o + o_idx, buf, o_msk)
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
-                 sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
+                 sparsity_block_size: int) -> BlksprsTensor:
     """For each row in ``y`` adds the value to each value in the corresponding row of the block-sparse tensor ``x``.
     Args:
@@ -272,7 +305,6 @@ def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
         sparsity_layout_x (Tensor): The sparsity layout of the block-sparse tensor.
         y (BlksprsTensor): A block-sparse tensor in compressed form with only one value per row and a single column of sparse blocks.
         sparsity_block_size (int): The size of the sparsity blocks.
-        triton_block_size (int): The block size to use for the triton kernel (default ``None``).
     Returns:
         BlksprsTensor: The values of ``x`` with the first value of ``y`` in each row added to them as a block-sparse tensor in
@@ -284,9 +316,8 @@ def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
     validate_device(x)
     validate_sparsity(sparsity_block_size, (x, sparsity_layout_x))
     validate_sparsity_block_size(sparsity_block_size, x)
-    validate_triton_block_size(triton_block_size, sparsity_block_size)
-    sparsity_lut = torch.nonzero(sparsity_layout_x).contiguous()
+    sparsity_lut_x = torch.nonzero(sparsity_layout_x).contiguous()
     sparsity_layout_rwm, _ = torch.max(sparsity_layout_x, dim=-1, keepdim=True)
     sparsity_layout_rwm_flat = sparsity_layout_rwm.reshape(-1)
@@ -294,60 +325,73 @@ def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
                                 (sparsity_layout_rwm_flat == 1) -
                                 (1 * (sparsity_layout_rwm_flat == 0)))
-    validate_contiguous(sparsity_layout_x, sparsity_lut, sparsity_reverse_lut_rwm)
-    output = torch.empty_like(x)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    s_lut_r, s_lut_c = sparsity_lut.size()
-    s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
-    y_b, y_r, y_c = y.size()
-    y_b_s, y_r_s, y_c_s = stride(y)
-    s_l_y_b, s_l_y_r, s_l_y_c = sparsity_layout_rwm.size()
-    s_l_y_b_s, s_l_y_r_s, s_l_y_c_s = stride(sparsity_layout_rwm)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-    triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-    (kernel_blocksparse_row_wise_add[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-      y, y_b, y_b_s, y_r_s, y_c_s,
-      s_l_y_b, s_l_y_b_s, s_l_y_r_s,
-      sparsity_reverse_lut_rwm,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      triton_block_size
-      ))
+    validate_contiguous(sparsity_layout_x, sparsity_lut_x, sparsity_reverse_lut_rwm)
-    return BlksprsTensor(output)
+    return BlksprsTensor(row_wise_add_forward(x, sparsity_lut_x, sparsity_layout_rwm,
+                                              sparsity_reverse_lut_rwm, y, sparsity_block_size))
 def row_wise_sub(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
-                 sparsity_block_size: int, triton_block_size: int = None) -> BlksprsTensor:
+                 sparsity_block_size: int) -> BlksprsTensor:
     """Wrapper for ``row_wise_add`` with negated y.
     """
-    return row_wise_add(x, sparsity_layout_x, torch.neg(y), sparsity_block_size, triton_block_size)
+    return row_wise_add(x, sparsity_layout_x, torch.neg(y), sparsity_block_size)
+@triton_op("blksprs::row_wise_add_forward", mutates_args={})
+def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
+                         sparsity_layout_x_rwm: Tensor, sparsity_reverse_x_lut_rwm: Tensor,
+                         y: Tensor, sparsity_block_size: int) -> Tensor:
+    with torch.no_grad():
+        output = torch.zeros_like(x)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        s_lut_r, s_lut_c = sparsity_lut_x.size()
+        s_lut_r_s, s_lut_c_s = stride(sparsity_lut_x)
+        y_b, y_r, y_c = y.size()
+        y_b_s, y_r_s, y_c_s = stride(y)
+        s_l_y_b, s_l_y_r, s_l_y_c = sparsity_layout_x_rwm.size()
+        s_l_y_b_s, s_l_y_r_s, s_l_y_c_s = stride(sparsity_layout_x_rwm)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        triton_grid = lambda meta: [o_b,
+                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(kernel_blocksparse_row_wise_add)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          sparsity_lut_x, s_lut_r, s_lut_r_s, s_lut_c_s,
+          y, y_b, y_b_s, y_r_s, y_c_s,
+          s_l_y_b, s_l_y_b_s, s_l_y_r_s,
+          sparsity_reverse_x_lut_rwm,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          sparsity_block_size))
+        return output
+# noinspection PyUnusedLocal
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
+    reset_to_zero=["o"]
+)
 @triton.jit
 def kernel_blocksparse_row_wise_add(x,
                                     x_b, x_b_s, x_r_s, x_c_s,
-                                    s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
                                     y, y_b, y_b_s, y_r_s, y_c_s,
                                     s_l_y_b, s_l_y_b_s, s_l_y_r_s,
                                     r_lut_y,
                                     o,
                                     o_b, o_b_s, o_r_s, o_c_s,
+                                    sparsity_block_size,
                                     TRITON_BLOCK_SIZE: tl.constexpr) -> None:
     # Get triton block indices
     pid_blk = tl.program_id(axis=0)
@@ -355,13 +399,13 @@ def kernel_blocksparse_row_wise_add(x,
     pid_col = tl.program_id(axis=2)
     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
+    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
+    spa_bat = tl.load(s_lut_x + spa_bat_idx, mask=spa_bat_msk)
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_row_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
+    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_x_r * s_lut_x_r_s)
+    spa_row = tl.load(s_lut_x + spa_row_idx, mask=spa_row_msk)
     # Get reverse sparsity indices for s
     rev_idx_spa_s_idx = (spa_bat * s_l_y_b_s +
@@ -377,14 +421,16 @@ def kernel_blocksparse_row_wise_add(x,
     blk_x_idx = ((pid_blk * x_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx >= 0 and blk_x_idx < x_b * x_b_s)
+    blk_x_msk = (blk_x_idx >= 0 and
+                 blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
     # Load sum block
     blk_s_idx = (rev_idx_spa_s * y_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * y_r_s)[:, None] +
                  (tl.arange(0, 1) * y_c_s)[None, :])
-    blk_s_msk = (blk_s_idx >= 0 and blk_s_idx < y_b * y_b_s)
+    blk_s_msk = (blk_s_idx >= 0 and
+                 blk_s_idx < y_b * y_b_s)
     blk_s = tl.load(y + blk_s_idx, mask=blk_s_msk)
     # Compute exp
@@ -394,5 +440,6 @@ def kernel_blocksparse_row_wise_add(x,
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
+    blk_o_msk = (blk_o_idx >= 0 and
+                 blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)

blksprs 1.11__py3-none-any.whl → 2.0__py3-none-any.whl

blksprs 1.11__py3-none-any.whl → 2.0__py3-none-any.whl