blksprs-2.0rc4-py3-none-any.whl → blksprs-2.0rc7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blksprs/layouting/distribution_layout.py +11 -15
- blksprs/layouting/sparsity_layout.py +26 -31
- blksprs/ops/conversion.py +45 -63
- blksprs/ops/distribution.py +38 -57
- blksprs/ops/flow.py +22 -33
- blksprs/ops/matmul.py +19 -20
- blksprs/ops/misc/broadcast_ops.py +15 -19
- blksprs/ops/misc/row_wise.py +39 -54
- blksprs/ops/softmax.py +30 -44
- blksprs/utils/autotuning.py +78 -0
- blksprs/utils/tools.py +0 -28
- blksprs/utils/validation.py +3 -0
- {blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/METADATA +18 -5
- blksprs-2.0rc7.dist-info/RECORD +23 -0
- blksprs-2.0rc4.dist-info/RECORD +0 -22
- {blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/WHEEL +0 -0
- {blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/top_level.txt +0 -0
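The new blksprs/utils/autotuning.py (+78 lines) is added in this release but its body is not part of the diff below; the kernels only reference get_autotune_configs() and prune_autotune_configs. A minimal sketch of what such a module might look like, assuming the configs vary the TRITON_BLOCK_SIZE meta-parameter and that pruning drops configs whose block size exceeds the kernel's sparsity_block_size argument (the concrete values and the prune-hook signature are assumptions, not taken from the package):

# Hypothetical sketch of blksprs/utils/autotuning.py; the real module is not shown in this diff.
import triton


def get_autotune_configs():
    # Candidate TRITON_BLOCK_SIZE values and num_warps are illustrative assumptions.
    return [
        triton.Config({"TRITON_BLOCK_SIZE": tbs}, num_warps=w)
        for tbs in (16, 32, 64, 128)
        for w in (2, 4)
    ]


def prune_autotune_configs(configs, named_args, **kwargs):
    # Drop configs whose block size exceeds the sparsity block size, which would make
    # the former min(sparsity_block_size, TRITON_BLOCK_SIZE) clamp in the kernels unnecessary.
    sparsity_block_size = named_args["sparsity_block_size"]
    return [c for c in configs if c.kwargs["TRITON_BLOCK_SIZE"] <= sparsity_block_size]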
blksprs/ops/flow.py
CHANGED
@@ -5,7 +5,8 @@ from torch._library import triton_op
 from torch._library.triton import wrap_triton
 from triton import language as tl
 
-from blksprs.utils.tools import stride
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 
 
 @triton_op("blksprs::flow_pull", mutates_args={})
@@ -43,7 +44,8 @@ def flow_pull_forward(x: Tensor, sparsity_layout_o: Tensor,
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -61,9 +63,6 @@ def flow_pull_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get sparsity index of current output block consisting of its batch, row, and column index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
@@ -86,21 +85,17 @@ def flow_pull_kernel(x,
 
     if rev_idx_spa >= 0:
         blk_x_idx = (rev_idx_spa * x_b_s +
-                     ((pid_row *
-                     ((pid_col *
-        blk_x_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+        blk_x_msk = (blk_x_idx >= 0 and
+                     blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         blk_o_idx = (pid_blk * o_b_s +
-                     ((pid_row *
-                     ((pid_col *
-        blk_o_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+        blk_o_msk = (blk_o_idx >= 0 and
+                     blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
 
 
@@ -138,7 +133,8 @@ def flow_push_forward(x: Tensor, sparsity_layout_x: Tensor, sparsity_lut: Tensor
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -156,9 +152,6 @@ def flow_push_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get sparsity index of current input block consisting of its batch, row, and column index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
@@ -181,19 +174,15 @@ def flow_push_kernel(x,
 
     if rev_idx_spa >= 0:
         blk_x_idx = (pid_blk * x_b_s +
-                     ((pid_row *
-                     ((pid_col *
-        blk_x_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+        blk_x_msk = (blk_x_idx >= 0 and
+                     blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         blk_o_idx = (rev_idx_spa * o_b_s +
-                     ((pid_row *
-                     ((pid_col *
-        blk_o_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+        blk_o_msk = (blk_o_idx >= 0 and
+                     blk_o_idx < o_b * o_b_s)
         tl.atomic_add(o + blk_o_idx, blk_x, mask=blk_o_msk)
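Across these hunks the per-element bounds check against val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE) is replaced by a pure range check on the flattened offsets (blk_idx >= 0 and blk_idx < x_b * x_b_s). A small host-side PyTorch sketch of the equivalent addressing may help when reading the kernels; the names mirror flow_pull_kernel, while the shapes and indices are illustrative assumptions:

# Host-side illustration of the block addressing (assumed shapes, not blksprs code).
import torch

TRITON_BLOCK_SIZE = 16
x = torch.arange(4 * 32 * 32, dtype=torch.float32).reshape(4, 32, 32)  # stacked sparse blocks
x_b_s, x_r_s, x_c_s = x.stride()
x_b = x.size(0)

rev_idx_spa, pid_row, pid_col = 2, 1, 0
rows = pid_row * TRITON_BLOCK_SIZE + torch.arange(TRITON_BLOCK_SIZE)
cols = pid_col * TRITON_BLOCK_SIZE + torch.arange(TRITON_BLOCK_SIZE)

blk_x_idx = (rev_idx_spa * x_b_s
             + (rows * x_r_s)[:, None]
             + (cols * x_c_s)[None, :])
# New-style mask: only check that the flattened offset stays inside the buffer;
# tl.load would return zeros for masked-off positions.
blk_x_msk = (blk_x_idx >= 0) & (blk_x_idx < x_b * x_b_s)
blk_x = x.reshape(-1)[blk_x_idx]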
blksprs/ops/matmul.py
CHANGED
@@ -6,7 +6,8 @@ from triton import language as tl
 
 from blksprs.ops.transpose import transpose
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_dtype_float
 
@@ -117,7 +118,8 @@ def matmul_backward(ctx, grad_output):
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -141,9 +143,6 @@ def matmul_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
     spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
@@ -161,11 +160,11 @@ def matmul_kernel(x,
     buf = tl.zeros(shape=(TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), dtype=tl.float32)
 
     # Slide over triton block sized segments of input tensors
-    for i_seg_tri in range(0, tl.cdiv(s_l_x_c * sparsity_block_size,
+    for i_seg_tri in range(0, tl.cdiv(s_l_x_c * sparsity_block_size, TRITON_BLOCK_SIZE)):
         # Convert to segment index of sparsity layout
-        i_seg_spa = (i_seg_tri *
+        i_seg_spa = (i_seg_tri * TRITON_BLOCK_SIZE) // sparsity_block_size
         # Calculate the triton segment index within a block
-        i_seg_tri_mod = i_seg_tri % (sparsity_block_size //
+        i_seg_tri_mod = i_seg_tri % (sparsity_block_size // TRITON_BLOCK_SIZE)
 
         # Get reverse sparsity indices for input tensors x and y
         # These are either -1 if the block is empty or equal to the index of the block in the sparse tensor
@@ -185,23 +184,23 @@ def matmul_kernel(x,
         # If both blocks are present commence calculation
         if rev_idx_spa_x >= 0 and rev_idx_spa_y >= 0:
             blk_x_idx = ((rev_idx_spa_x * x_b_s) +
-                         ((pid_row *
-                         ((i_seg_tri_mod *
+                         ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                         ((i_seg_tri_mod * TRITON_BLOCK_SIZE +
                            tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
             blk_x_msk = ((blk_x_idx >= 0 and
                          blk_x_idx < x_b * x_b_s) and
-                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] <
-                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] <
+                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < TRITON_BLOCK_SIZE and
+                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < TRITON_BLOCK_SIZE))
             blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
             blk_y_idx = ((rev_idx_spa_y * y_b_s) +
-                         ((i_seg_tri_mod *
+                         ((i_seg_tri_mod * TRITON_BLOCK_SIZE +
                            tl.arange(0, TRITON_BLOCK_SIZE)) * y_r_s)[:, None] +
-                         ((pid_col *
+                         ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * y_c_s)[None, :])
             blk_y_msk = ((blk_y_idx >= 0 and
                          blk_y_idx < y_b * y_b_s) and
-                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] <
-                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] <
+                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < TRITON_BLOCK_SIZE and
+                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < TRITON_BLOCK_SIZE))
             blk_y = tl.load(y + blk_y_idx, mask=blk_y_msk)
 
             # Perform matrix multiplication
@@ -212,12 +211,12 @@ def matmul_kernel(x,
 
     # Store output
     blk_o_idx = ((pid_blk * o_b_s) +
-                 ((pid_row *
-                 ((pid_col *
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
     blk_o_msk = ((blk_o_idx >= 0 and
                  blk_o_idx < o_b * o_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] <
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] <
+                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < TRITON_BLOCK_SIZE and
+                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < TRITON_BLOCK_SIZE))
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
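The rewritten loop in matmul_kernel walks the shared dimension in TRITON_BLOCK_SIZE-wide segments and maps each segment back to a sparsity-layout column (i_seg_spa) and an offset inside a sparsity block (i_seg_tri_mod). A small worked example of that index arithmetic, with illustrative values chosen here rather than taken from the package:

# Illustration of the segment indexing used in matmul_kernel (example values).
sparsity_block_size = 64
TRITON_BLOCK_SIZE = 16
s_l_x_c = 3  # number of sparsity-layout columns of x

num_segments = -(-(s_l_x_c * sparsity_block_size) // TRITON_BLOCK_SIZE)  # tl.cdiv -> 12

for i_seg_tri in range(num_segments):
    i_seg_spa = (i_seg_tri * TRITON_BLOCK_SIZE) // sparsity_block_size      # layout column: 0,0,0,0,1,1,...
    i_seg_tri_mod = i_seg_tri % (sparsity_block_size // TRITON_BLOCK_SIZE)  # segment within block: 0..3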
blksprs/ops/misc/broadcast_ops.py
CHANGED
@@ -6,11 +6,13 @@ from torch._library.triton import wrap_triton
 from triton import language as tl
 
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_device, \
     validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
                   sparsity_block_size: int) -> BlksprsTensor:
     """Performs a broadcast and subsequent addition of two dense tensors x and y. Returns a block-sparse tensor in
@@ -87,7 +89,8 @@ def broadcast_add_forward(x: Tensor, y: Tensor,
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -105,9 +108,6 @@ def broadcast_add_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
     spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
@@ -123,20 +123,18 @@ def broadcast_add_kernel(x,
 
     # Load x block
     blk_x_idx = (spa_bat_o * x_b_s +
-                 ((pid_row *
+                 ((pid_row * TRITON_BLOCK_SIZE + spa_row_o * sparsity_block_size +
                    tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (
-
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    blk_x_msk = (blk_x_idx >= 0 and
+                 blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Load y block
     blk_y_idx = (spa_bat_o * y_b_s +
-                 ((pid_col *
+                 ((pid_col * TRITON_BLOCK_SIZE + spa_col_o * sparsity_block_size +
                    tl.arange(0, TRITON_BLOCK_SIZE)) * y_c_s)[None, :])
-    blk_y_msk = (
-
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    blk_y_msk = (blk_y_idx >= 0 and
+                 blk_y_idx < y_b * y_b_s)
    blk_y = tl.load(y + blk_y_idx, mask=blk_y_msk)
 
     # Compute sum
@@ -145,10 +143,8 @@ def broadcast_add_kernel(x,
 
     # Store result
     blk_o_idx = ((pid_blk * o_b_s) +
-                 ((pid_row *
-                 ((pid_col *
-    blk_o_msk = (
-
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+    blk_o_msk = (blk_o_idx >= 0 and
+                 blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
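broadcast_add is now decorated with @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16): when autocast is active at the call site, floating-point inputs are cast to float16 and the body runs with autocast disabled. A minimal standalone sketch of that decorator's effect on an arbitrary function (the function and shapes below are hypothetical, not part of blksprs):

import torch


@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
def scaled_sum(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # If autocast was enabled at the call site, x and y arrive here already cast to float16
    # and autocast is disabled inside this function.
    return x + y


if torch.cuda.is_available():
    a = torch.randn(8, 8, device="cuda", dtype=torch.float32)
    b = torch.randn(8, 8, device="cuda", dtype=torch.float32)
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        out = scaled_sum(a, b)
    print(out.dtype)  # torch.float16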
blksprs/ops/misc/row_wise.py
CHANGED
@@ -4,8 +4,9 @@ from torch import Tensor
 from torch._library.triton import wrap_triton, triton_op
 from triton import language as tl
 
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride
+from blksprs.utils.tools import stride
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, validate_sparsity, \
     validate_sparsity_block_size
 
@@ -94,9 +95,11 @@ def row_wise_sum_forward(x: Tensor, sparsity_lut: Tensor,
     return output
 
 
+# noinspection PyUnusedLocal
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -114,9 +117,6 @@ def row_wise_sum_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
@@ -137,23 +137,19 @@ def row_wise_sum_kernel(x,
         return
 
     blk_idx = ((pid_blk * x_b_s) +
-               ((pid_row *
-               ((pid_col *
-    blk_msk = (
-
-               (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+               ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+               ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_msk = (blk_idx >= 0 and
+               blk_idx < x_b * x_b_s)
     blk = tl.load(x + blk_idx, mask=blk_msk)
 
     buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
 
     o_idx = (rev_idx_spa * o_b_s +
-             ((pid_row *
+             ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
             (tl.arange(0, 1))[None, :])
-    o_msk = (
-
-             (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-              tl.arange(0, 1)[None, :] < val_tbs))
+    o_msk = (o_idx >= 0 and
+             o_idx < o_b * o_b_s)
     tl.atomic_add(o + o_idx, buf, o_msk)
 
 
@@ -180,6 +176,8 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     of the input and the sparsity layout of the output tensor.
 
     """
+    # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376
+    x = torch.where(x == -0.0, torch.tensor(0.0), x)
     x = x.contiguous()
 
     validate_dimensions(x)
@@ -214,7 +212,7 @@ def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
     output = torch.full(size=(n_sparse_blocks_output,
                               sparsity_block_size,
                               1 if flag_slice_only else sparsity_block_size),
-                        fill_value=
+                        fill_value=torch.finfo(x.dtype).min,
                         device=x.device)
 
     x_b, x_r, x_c = x.size()
@@ -243,9 +241,11 @@ def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
     return output
 
 
+# noinspection PyUnusedLocal
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     restore_value=["o"]
 )
 @triton.jit
@@ -263,9 +263,6 @@ def row_wise_max_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
@@ -286,23 +283,19 @@ def row_wise_max_kernel(x,
         return
 
     blk_idx = ((pid_blk * x_b_s) +
-               ((pid_row *
-               ((pid_col *
-    blk_msk = (
-
-               (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+               ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+               ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_msk = (blk_idx >= 0 and
+               blk_idx < x_b * x_b_s)
     blk = tl.load(x + blk_idx, mask=blk_msk)
 
     buf = tl.reshape(tl.max(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
 
     o_idx = (rev_idx_spa * o_b_s +
-             ((pid_row *
+             ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
             (tl.arange(0, 1))[None, :])
-    o_msk = (
-
-             (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-              tl.arange(0, 1)[None, :] < val_tbs))
+    o_msk = (o_idx >= 0 and
+             o_idx < o_b * o_b_s)
     tl.atomic_max(o + o_idx, buf, o_msk)
 
 
@@ -371,7 +364,7 @@ def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
 
-    (kernel_blocksparse_row_wise_add[triton_grid]
+    (wrap_triton(kernel_blocksparse_row_wise_add)[triton_grid]
     (x,
      x_b, x_b_s, x_r_s, x_c_s,
      sparsity_lut_x, s_lut_r, s_lut_r_s, s_lut_c_s,
@@ -387,7 +380,8 @@ def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -406,9 +400,6 @@ def kernel_blocksparse_row_wise_add(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
@@ -430,22 +421,18 @@ def kernel_blocksparse_row_wise_add(x,
 
     # Load x block
     blk_x_idx = ((pid_blk * x_b_s) +
-                 ((pid_row *
-                 ((pid_col *
-    blk_x_msk = (
-
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx >= 0 and
+                 blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Load sum block
     blk_s_idx = (rev_idx_spa_s * y_b_s +
-                 ((pid_row *
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * y_r_s)[:, None] +
                  (tl.arange(0, 1) * y_c_s)[None, :])
-    blk_s_msk = (
-
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, 1)[None, :] < val_tbs))
+    blk_s_msk = (blk_s_idx >= 0 and
+                 blk_s_idx < y_b * y_b_s)
     blk_s = tl.load(y + blk_s_idx, mask=blk_s_msk)
 
     # Compute exp
@@ -453,10 +440,8 @@ def kernel_blocksparse_row_wise_add(x,
 
     # Store block
     blk_o_idx = ((pid_blk * o_b_s) +
-                 ((pid_row *
-                 ((pid_col *
-    blk_o_msk = (
-
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+    blk_o_msk = (blk_o_idx >= 0 and
+                 blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
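The new guard in row_wise_max works around the Triton negative-zero issue linked in the TODO by normalizing zeros on the host before the kernel runs. Because IEEE-754 compares -0.0 equal to 0.0, the torch.where rewrites both signed zeros to +0.0 and leaves every other value untouched; a short standalone check (not blksprs code) that illustrates this:

import torch

x = torch.tensor([-0.0, 0.0, -1.5, 2.0])
# -0.0 == 0.0 is True under IEEE-754, so both zeros match and become +0.0.
x = torch.where(x == -0.0, torch.tensor(0.0), x)
print(x)                 # tensor([ 0.0,  0.0, -1.5,  2.0])
print(torch.signbit(x))  # no set sign bits on the zeros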
blksprs/ops/softmax.py
CHANGED
@@ -7,7 +7,8 @@ from triton import language as tl
 
 from blksprs.ops.misc.row_wise import row_wise_sum, row_wise_max, row_wise_sub
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32
 
@@ -114,7 +115,8 @@ def softmax_backward(ctx, grad_output):
                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
 
-
+    # TODO wrap
+    (softmax_kernel_grad[triton_grid]
     (grad_output,
      o_b, o_b_s, o_r_s, o_c_s,
      o,
@@ -133,7 +135,8 @@ def softmax_backward(ctx, grad_output):
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -151,9 +154,6 @@ def softmax_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
@@ -172,22 +172,18 @@ def softmax_kernel(x,
     if rev_idx_spa_s >= 0:
         # Load x block
         blk_x_idx = ((pid_blk * x_b_s) +
-                     ((pid_row *
-                     ((pid_col *
-        blk_x_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+        blk_x_msk = (blk_x_idx >= 0 and
+                     blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         # Load sum block
         blk_s_idx = (rev_idx_spa_s * s_b_s +
-                     ((pid_row *
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, 1)[None, :] < val_tbs))
+        blk_s_msk = (blk_s_idx >= 0 and
+                     blk_s_idx < s_b * s_b_s)
         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
 
         # Compute softmax
@@ -199,7 +195,8 @@ def softmax_kernel(x,
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -221,9 +218,6 @@ def softmax_kernel_grad(g,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
-
     # Get position of current sparsity block consisting of its batch and row index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
@@ -240,41 +234,33 @@ def softmax_kernel_grad(g,
 
     if rev_idx_spa_s >= 0:
         blk_s_idx = (rev_idx_spa_s * s_b_s +
-                     ((pid_row *
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, 1)[None, :] < val_tbs))
+        blk_s_msk = (blk_s_idx >= 0 and
+                     blk_s_idx < s_b * s_b_s)
         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
 
         blk_g_idx = ((pid_blk * g_b_s) +
-                     ((pid_row *
-                     ((pid_col *
-        blk_g_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_c_s)[None, :])
+        blk_g_msk = (blk_g_idx >= 0 and
+                     blk_g_idx < g_b * g_b_s)
         blk_g = tl.load(g + blk_g_idx, mask=blk_g_msk)
 
         blk_x_idx = ((pid_blk * x_b_s) +
-                     ((pid_row *
-                     ((pid_col *
-        blk_x_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+        blk_x_msk = (blk_x_idx >= 0 and
+                     blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         buf = blk_x * (blk_g - blk_s)
 
         blk_o_idx = ((pid_blk * o_b_s) +
-                     ((pid_row *
-                     ((pid_col *
-        blk_o_msk = (
-
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+        blk_o_msk = (blk_o_idx >= 0 and
+                     blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
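For orientation, the decomposition softmax.py builds from row_wise_max, row_wise_sub and row_wise_sum corresponds to the usual numerically stable softmax, and the gradient computed in softmax_kernel_grad (buf = blk_x * (blk_g - blk_s)) matches its Jacobian-vector product if the s tensor holds the row-wise sum of grad_output * output. A dense PyTorch sketch of both, as a reference only and not the block-sparse code path:

import torch

def reference_softmax(x: torch.Tensor) -> torch.Tensor:
    # Row-wise max subtraction for numerical stability, then exp and row-wise normalization.
    x = x - x.max(dim=-1, keepdim=True).values
    e = torch.exp(x)
    return e / e.sum(dim=-1, keepdim=True)

def reference_softmax_grad(softmax_out: torch.Tensor, grad_out: torch.Tensor) -> torch.Tensor:
    # s = sum_j g_j * y_j per row; dx_i = y_i * (g_i - s), mirroring blk_x * (blk_g - blk_s).
    s = (grad_out * softmax_out).sum(dim=-1, keepdim=True)
    return softmax_out * (grad_out - s)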