PyPI - blksprs - Versions diffs - 2.0rc4__py3-none-any.whl → 2.0rc7__py3-none-any.whl - Mend

blksprs 2.0rc4__py3-none-any.whl → 2.0rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

blksprs/layouting/distribution_layout.py +11 -15
blksprs/layouting/sparsity_layout.py +26 -31
blksprs/ops/conversion.py +45 -63
blksprs/ops/distribution.py +38 -57
blksprs/ops/flow.py +22 -33
blksprs/ops/matmul.py +19 -20
blksprs/ops/misc/broadcast_ops.py +15 -19
blksprs/ops/misc/row_wise.py +39 -54
blksprs/ops/softmax.py +30 -44
blksprs/utils/autotuning.py +78 -0
blksprs/utils/tools.py +0 -28
blksprs/utils/validation.py +3 -0
{blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/METADATA +18 -5
blksprs-2.0rc7.dist-info/RECORD +23 -0
blksprs-2.0rc4.dist-info/RECORD +0 -22
{blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/WHEEL +0 -0
{blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/top_level.txt +0 -0

blksprs/layouting/distribution_layout.py CHANGED Viewed

@@ -4,7 +4,8 @@ from torch import Tensor
 from triton import language as tl
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride, get_autotune_configs
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous
@@ -47,6 +48,7 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
                                 triton.cdiv(i_r, meta["TRITON_BLOCK_SIZE"]),
                                 triton.cdiv(i_c, meta["TRITON_BLOCK_SIZE"])]
+    # TODO wrap
     (build_distribution_layout_kernel[triton_grid]
      (indices,
       i_b, i_b_s, i_r_s, i_c_s,
@@ -62,7 +64,8 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -80,9 +83,6 @@ def build_distribution_layout_kernel(i,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_i_idx = (pid_blk * s_lut_i_r_s + 0 * s_lut_i_c_s)
     spa_bat_i_msk = (spa_bat_i_idx >= 0 and spa_bat_i_idx < s_lut_i_r * s_lut_i_r_s)
@@ -97,12 +97,10 @@ def build_distribution_layout_kernel(i,
     spa_col_i = tl.load(s_lut_i + spa_col_i_idx, mask=spa_col_i_msk)
     blk_i_idx = (pid_blk * i_b_s +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
-    blk_i_msk = ((blk_i_idx >= 0 and
-                  blk_i_idx < i_b * i_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
+    blk_i_msk = (blk_i_idx >= 0 and
+                 blk_i_idx < i_b * i_b_s)
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk)
     dst_bat_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_i, dtype=tl.int32)
@@ -120,8 +118,6 @@ def build_distribution_layout_kernel(i,
     blk_o_idx = ((dst_bat_idx * o_b_s) +
                  (dst_row_idx * o_r_s) +
                  (dst_col_idx * o_c_s))
-    blk_o_msk = ((blk_o_idx >= 0 and
-                  blk_o_idx < o_b * o_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    blk_o_msk = (blk_o_idx >= 0 and
+                 blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, blk_v, mask=blk_o_msk)

blksprs/layouting/sparsity_layout.py CHANGED Viewed

@@ -7,7 +7,8 @@ from torch._library.triton import wrap_triton
 from triton import language as tl
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride, get_autotune_configs
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous, validate_sparsity, validate_sparsity_block_size
@@ -37,10 +38,11 @@ def build_sparsity_layout(x: Tensor, sparsity_block_size: int) -> Tensor:
     o_b_s, o_r_s, o_c_s = stride(output)
     triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, min(meta["sparsity_block_size"], meta["TRITON_BLOCK_SIZE"])),
-                                triton.cdiv(x_c, min(meta["sparsity_block_size"], meta["TRITON_BLOCK_SIZE"]))]
+                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(build_sparsity_layout_kernel)[triton_grid]
+    # TODO wrap
+    (build_sparsity_layout_kernel[triton_grid]
      (x,
       x_b, x_b_s, x_r_s, x_c_s,
       output,
@@ -52,7 +54,8 @@ def build_sparsity_layout(x: Tensor, sparsity_block_size: int) -> Tensor:
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -67,24 +70,19 @@ def build_sparsity_layout_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
     # Load x values
     blk_x_idx = (pid_bat * x_b_s +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = ((blk_x_idx >= 0 and
-                  blk_x_idx < x_b * x_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx >= 0 and
+                 blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
     # Store sparsity layout value
     if tl.min(blk_x) != 0 or tl.max(blk_x) != 0:
         blk_o_idx = (pid_bat * o_b_s +
-                     (((pid_row * val_tbs) // sparsity_block_size) * o_r_s +
-                      ((pid_col * val_tbs) // sparsity_block_size) * o_c_s))
+                     (((pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_r_s +
+                      ((pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_c_s))
         blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, 1, mask=blk_o_msk)
@@ -129,10 +127,11 @@ def build_sparsity_layout_adaption(x: BlksprsTensor, sparsity_layout_from: Tenso
     o_b_s, o_r_s, o_c_s = stride(output)
     triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, min(meta["sparsity_block_size_to"], meta["TRITON_BLOCK_SIZE"])),
-                                triton.cdiv(x_c, min(meta["sparsity_block_size_to"], meta["TRITON_BLOCK_SIZE"]))]
+                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(build_sparsity_layout_adaption_kernel)[triton_grid]
+    # TODO wrap
+    (build_sparsity_layout_adaption_kernel[triton_grid]
      (x,
       x_b, x_b_s, x_r_s, x_c_s,
       sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
@@ -146,7 +145,8 @@ def build_sparsity_layout_adaption(x: BlksprsTensor, sparsity_layout_from: Tenso
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size_from", "sparsity_block_size_to"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs_conversion},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -163,9 +163,6 @@ def build_sparsity_layout_adaption_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size_to, TRITON_BLOCK_SIZE)
     # Get sparsity index of current output block consisting of its batch, row, and column index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
@@ -181,20 +178,18 @@ def build_sparsity_layout_adaption_kernel(x,
     # Load x values
     blk_x_idx = ((pid_blk * x_b_s) +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = ((blk_x_idx >= 0 and
-                  blk_x_idx < x_b * x_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < sparsity_block_size_from and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < sparsity_block_size_from))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx >= 0 and
+                 blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
     # Store sparsity layout value
     if tl.min(blk_x) != 0 or tl.max(blk_x) != 0:
         blk_o_idx = ((spa_bat * o_b_s) +
-                     (((pid_row * val_tbs + spa_row * sparsity_block_size_from)
+                     (((pid_row * TRITON_BLOCK_SIZE + spa_row * sparsity_block_size_from)
                        // sparsity_block_size_to) * o_r_s) +
-                     (((pid_col * val_tbs + spa_col * sparsity_block_size_from)
+                     (((pid_col * TRITON_BLOCK_SIZE + spa_col * sparsity_block_size_from)
                        // sparsity_block_size_to) * o_c_s))
         blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, 1, mask=blk_o_msk)

blksprs/ops/conversion.py CHANGED Viewed

@@ -6,7 +6,8 @@ from triton import language as tl
 from blksprs.layouting.sparsity_layout import build_sparsity_layout_adaption
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride, get_autotune_configs
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_sparsity_dense
@@ -86,7 +87,8 @@ def to_sparse_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -102,9 +104,6 @@ def to_sparse_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
     # Get sparsity index of current output block consisting of its batch, row, and column index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
@@ -120,24 +119,20 @@ def to_sparse_kernel(x,
     # Load block from dense tensor
     blk_d_idx = (spa_bat * x_b_s +
-                 ((pid_row * val_tbs + spa_row * sparsity_block_size +
+                 ((pid_row * TRITON_BLOCK_SIZE + spa_row * sparsity_block_size +
                    tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                 ((pid_col * val_tbs + spa_col * sparsity_block_size +
+                 ((pid_col * TRITON_BLOCK_SIZE + spa_col * sparsity_block_size +
                    tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_d_msk = ((blk_d_idx >= 0 and
-                  blk_d_idx < x_b * x_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    blk_d_msk = (blk_d_idx >= 0 and
+                 blk_d_idx < x_b * x_b_s)
     blk_d = tl.load(x + blk_d_idx, mask=blk_d_msk)
     # Store block in sparse tensor
     blk_o_idx = ((pid_blk * o_b_s) +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE) * o_c_s))[None, :])
-    blk_o_msk = ((blk_o_idx >= 0 and
-                  blk_o_idx < (pid_blk + 1) * o_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE) * o_c_s))[None, :])
+    blk_o_msk = (blk_o_idx >= 0 and
+                 blk_o_idx < (pid_blk + 1) * o_b_s)
     tl.store(o + blk_o_idx, blk_d, mask=blk_o_msk)
@@ -228,8 +223,8 @@ def to_dense_forward(x: Tensor, sparsity_layout: Tensor,
     o_b_s, o_r_s, o_c_s = stride(output)
     triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, min(meta["sparsity_block_size"], meta["TRITON_BLOCK_SIZE"])),
-                                triton.cdiv(o_c, min(meta["sparsity_block_size"], meta["TRITON_BLOCK_SIZE"]))]
+                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
     (wrap_triton(to_dense_kernel)[triton_grid]
      (x,
@@ -252,7 +247,8 @@ def to_dense_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     restore_value=["o"]
 )
 @triton.jit
@@ -269,12 +265,9 @@ def to_dense_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
     # Get sparsity index of current block
-    spa_row = (pid_row * val_tbs) // sparsity_block_size
-    spa_col = (pid_col * val_tbs) // sparsity_block_size
+    spa_row = (pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size
+    spa_col = (pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size
     # Get reverse sparsity index for current block
     rev_idx_spa_idx = (pid_blk * s_l_b_s + spa_row * s_l_r_s + spa_col * s_l_c_s)
@@ -284,22 +277,18 @@ def to_dense_kernel(x,
     # If block is present commence operations
     if rev_idx_spa >= 0:
         blk_idx = (rev_idx_spa * x_b_s +
-                   (((pid_row % (sparsity_block_size // val_tbs)) * val_tbs +
+                   (((pid_row % (sparsity_block_size // TRITON_BLOCK_SIZE)) * TRITON_BLOCK_SIZE +
                      tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                   (((pid_col % (sparsity_block_size // val_tbs)) * val_tbs +
+                   (((pid_col % (sparsity_block_size // TRITON_BLOCK_SIZE)) * TRITON_BLOCK_SIZE +
                      tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_msk = ((blk_idx >= 0 and
-                    blk_idx < x_b * x_b_s) and
-                   (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                    tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+        blk_msk = (blk_idx >= 0 and
+                   blk_idx < x_b * x_b_s)
         blk = tl.load(x + blk_idx, mask=blk_msk)
         o_idx = (pid_blk * o_b_s +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        o_msk = ((o_idx >= 0 and o_idx < o_b * o_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+        o_msk = (o_idx >= 0 and o_idx < o_b * o_b_s)
         tl.store(o + o_idx, blk, o_msk)
@@ -403,12 +392,11 @@ def adapt_layout_forward(x: Tensor,
     s_lut_o_r_s, s_lut_o_c_s = stride(sparsity_lut_to)
     triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, min(meta["sparsity_block_size_from"], meta["sparsity_block_size_to"],
-                                                     meta["TRITON_BLOCK_SIZE"])),
-                                triton.cdiv(o_c, min(meta["sparsity_block_size_from"], meta["sparsity_block_size_to"],
-                                                     meta["TRITON_BLOCK_SIZE"]))]
+                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(adapt_layout_kernel)[triton_grid]
+    # TODO wrap
+    (adapt_layout_kernel[triton_grid]
      (x,
       x_b, x_b_s, x_r_s, x_c_s,
       s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
@@ -434,7 +422,8 @@ def adapt_layout_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size_from", "sparsity_block_size_to"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs_conversion},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -453,9 +442,6 @@ def adapt_layout_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size (Triton can only handle 2-valued min)
-    val_tbs = min(min(sparsity_block_size_from, sparsity_block_size_to), TRITON_BLOCK_SIZE)
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
     spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
@@ -471,8 +457,8 @@ def adapt_layout_kernel(x,
     # Get equivalent sparsity block in from layout
     spa_bat_x = spa_bat_o
-    spa_row_x = (spa_row_o * sparsity_block_size_to + pid_row * val_tbs) // sparsity_block_size_from
-    spa_col_x = (spa_col_o * sparsity_block_size_to + pid_col * val_tbs) // sparsity_block_size_from
+    spa_row_x = (spa_row_o * sparsity_block_size_to + pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size_from
+    spa_col_x = (spa_col_o * sparsity_block_size_to + pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size_from
     # Get reverse sparsity indices for x
     rev_idx_spa_x_idx = (spa_bat_x * s_l_x_b_s +
@@ -484,29 +470,25 @@ def adapt_layout_kernel(x,
     # If block is present commence operations
     if rev_idx_spa_x >= 0:
         # Calculate triton block size shifts
-        shift_row_x = ((spa_row_o * sparsity_block_size_to + pid_row * val_tbs)
-                       % sparsity_block_size_from) // val_tbs
-        shift_col_x = ((spa_col_o * sparsity_block_size_to + pid_col * val_tbs)
-                       % sparsity_block_size_from) // val_tbs
+        shift_row_x = ((spa_row_o * sparsity_block_size_to + pid_row * TRITON_BLOCK_SIZE)
+                       % sparsity_block_size_from) // TRITON_BLOCK_SIZE
+        shift_col_x = ((spa_col_o * sparsity_block_size_to + pid_col * TRITON_BLOCK_SIZE)
+                       % sparsity_block_size_from) // TRITON_BLOCK_SIZE
         # Load x values
         blk_x_idx = ((rev_idx_spa_x * x_b_s) +
-                     ((shift_row_x * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                     ((shift_col_x * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = ((blk_x_idx >= 0 and
-                      blk_x_idx < x_b * x_b_s) and
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((shift_row_x * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                     ((shift_col_x * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+        blk_x_msk = (blk_x_idx >= 0 and
+                     blk_x_idx < x_b * x_b_s)
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
         # Store output
         blk_o_idx = ((pid_blk * o_b_s) +
-                     ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                     ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = ((blk_o_idx >= 0 and
-                      blk_o_idx < o_b * o_b_s) and
-                     (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                      tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                     ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                     ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+        blk_o_msk = (blk_o_idx >= 0 and
+                     blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)

blksprs/ops/distribution.py CHANGED Viewed

@@ -6,7 +6,8 @@ from torch._library.triton import wrap_triton
 from triton import language as tl
 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride, get_autotune_configs
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_dtype_int, validate_sparsity_block_size
@@ -100,7 +101,8 @@ def gather_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -121,9 +123,6 @@ def gather_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
     spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
@@ -139,12 +138,10 @@ def gather_kernel(x,
     # Load index values
     blk_i_idx = ((pid_blk * i_b_s) +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
-    blk_i_msk = ((blk_i_idx >= 0 and
-                  blk_i_idx < i_b * i_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
+    blk_i_msk = (blk_i_idx >= 0 and
+                 blk_i_idx < i_b * i_b_s)
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
     # Get indices of sparsity blocks and positions within the blocks
@@ -154,9 +151,9 @@ def gather_kernel(x,
     rev_dst_bat_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_o, dtype=tl.int32)
     rev_dst_row_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_o, dtype=tl.int32)
     rev_dst_col_x = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_o, dtype=tl.int32)
-    dst_row_x = (((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+    dst_row_x = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
                  .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
-    dst_col_x = (((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+    dst_col_x = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
                  .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
     if dim == 0:
         rev_dst_bat_x = blk_i
@@ -171,32 +168,26 @@ def gather_kernel(x,
     rev_idx_spa_x_idx = ((rev_dst_bat_x * s_l_x_b_s) +
                          (rev_dst_row_x * s_l_x_r_s) +
                          (rev_dst_col_x * s_l_x_c_s))
-    rev_idx_spa_x_msk = ((rev_idx_spa_x_idx >= 0 and
-                          rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s) and
-                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    rev_idx_spa_x_msk = (rev_idx_spa_x_idx >= 0 and
+                         rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
     rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
     # Load x values
     blk_x_idx = ((rev_idx_spa_x * x_b_s) +
                  dst_row_x +
                  dst_col_x)
-    blk_x_msk = (((blk_x_idx >= 0 and
-                   blk_x_idx < x_b * x_b_s) and
-                  rev_idx_spa_x_msk != -1) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    blk_x_msk = ((blk_x_idx >= 0 and
+                  blk_x_idx < x_b * x_b_s) and
+                 rev_idx_spa_x_msk != -1)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
     # Store output
     blk_o_idx = ((pid_blk * o_b_s) +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (((blk_o_idx >= 0 and
-                   blk_o_idx < o_b * o_b_s) and
-                  rev_idx_spa_x_msk != -1) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
+    blk_o_msk = ((blk_o_idx >= 0 and
+                  blk_o_idx < o_b * o_b_s) and
+                 rev_idx_spa_x_msk != -1)
     tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
@@ -249,7 +240,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
                           reduce_op="none", lut=lut)
-@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
                    dim: int,
                    idx: BlksprsTensor,
@@ -357,7 +348,8 @@ def scatter_reduce_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[],
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
     reset_to_zero=["o"]
 )
 @triton.jit
@@ -379,9 +371,6 @@ def scatter_reduce_kernel(x,
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
-    # Get valid triton block size
-    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_x_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
     spa_bat_x_msk = (spa_bat_x_idx >= 0 and spa_bat_x_idx < s_lut_x_r * s_lut_x_r_s)
@@ -397,22 +386,18 @@ def scatter_reduce_kernel(x,
     # Load x values
     blk_x_idx = ((pid_blk * x_b_s) +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = ((blk_x_idx >= 0 and
-                  blk_x_idx < x_b * x_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = (blk_x_idx >= 0 and
+                 blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
     # Load index values
     blk_i_idx = ((pid_blk * i_b_s) +
-                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
-                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
-    blk_i_msk = ((blk_i_idx >= 0 and
-                  blk_i_idx < i_b * i_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+                 ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
+                 ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
+    blk_i_msk = (blk_i_idx >= 0 and
+                 blk_i_idx < i_b * i_b_s)
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
     # Get indices of sparsity blocks and positions within the blocks
@@ -422,9 +407,9 @@ def scatter_reduce_kernel(x,
     rev_dst_bat_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_x, dtype=tl.int32)
     rev_dst_row_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_row_x, dtype=tl.int32)
     rev_dst_col_o = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_col_x, dtype=tl.int32)
-    dst_row_o = (((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
+    dst_row_o = (((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None]
                  .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
-    dst_col_o = (((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
+    dst_col_o = (((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :]
                  .broadcast_to((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE)))
     if dim == 0:
         rev_dst_bat_o = blk_i
@@ -439,21 +424,17 @@ def scatter_reduce_kernel(x,
     rev_idx_spa_o_idx = ((rev_dst_bat_o * s_l_o_b_s) +
                          (rev_dst_row_o * s_l_o_r_s) +
                          (rev_dst_col_o * s_l_o_c_s))
-    rev_idx_spa_o_msk = ((rev_idx_spa_o_idx >= 0 and
-                          rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s) and
-                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    rev_idx_spa_o_msk = (rev_idx_spa_o_idx >= 0 and
+                         rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)
     # Store output
     blk_o_idx = ((rev_idx_spa_o * o_b_s) +
                  dst_row_o +
                  dst_col_o)
-    blk_o_msk = (((blk_o_idx >= 0 and
-                   blk_o_idx < o_b * o_b_s) and
-                  rev_idx_spa_o_msk != -1) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
+    blk_o_msk = ((blk_o_idx >= 0 and
+                  blk_o_idx < o_b * o_b_s) and
+                 rev_idx_spa_o_msk != -1)
     if reduce_op_ind == 0:
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)

blksprs 2.0rc4__py3-none-any.whl → 2.0rc7__py3-none-any.whl

blksprs 2.0rc4__py3-none-any.whl → 2.0rc7__py3-none-any.whl