blksprs 1.11__py3-none-any.whl → 2.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blksprs/__init__.py +2 -5
- blksprs/layouting/distribution_layout.py +32 -25
- blksprs/layouting/sparsity_layout.py +65 -52
- blksprs/ops/conversion.py +421 -399
- blksprs/ops/distribution.py +404 -366
- blksprs/ops/flow.py +125 -106
- blksprs/ops/matmul.py +220 -204
- blksprs/ops/misc/broadcast_ops.py +53 -35
- blksprs/ops/misc/row_wise.py +151 -91
- blksprs/ops/partitioning.py +136 -132
- blksprs/ops/repeat.py +115 -120
- blksprs/ops/softmax.py +274 -246
- blksprs/ops/transpose.py +52 -51
- blksprs/utils/benchmarking.py +3 -3
- blksprs/utils/tools.py +31 -4
- blksprs/utils/validation.py +0 -14
- {blksprs-1.11.dist-info → blksprs-2.0rc1.dist-info}/METADATA +42 -36
- blksprs-2.0rc1.dist-info/RECORD +22 -0
- {blksprs-1.11.dist-info → blksprs-2.0rc1.dist-info}/WHEEL +1 -1
- blksprs/utils/layout_utils.py +0 -17
- blksprs-1.11.dist-info/RECORD +0 -23
- {blksprs-1.11.dist-info → blksprs-2.0rc1.dist-info}/top_level.txt +0 -0
blksprs/__init__.py
CHANGED
@@ -18,19 +18,16 @@ class ops:
 class layouting:
     from blksprs.layouting.distribution_layout import build_distribution_layout
     from blksprs.layouting.sparsity_layout import build_sparsity_layout, build_sparsity_layout_adaption, \
-        build_sparsity_layout_matmul, build_sparsity_layout_matmul_fast
-    from blksprs.utils.layout_utils import build_full_sparsity_layout
+        build_sparsity_layout_matmul, build_sparsity_layout_matmul_fast, build_sparsity_layout_full


 class utils:
     from blksprs.utils.processing import apply_torch_linear, apply_torch_normalisation, apply_torch_dropout, \
         apply_function_applicable_row_wise
     from blksprs.utils.tools import do_shape_blocksparse, undo_shape_blocksparse
-    from blksprs.utils.validation import disable_validation

 class validation:
     from blksprs.utils.validation import disable_validation
     from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_dtype_float, \
         validate_dtype_int, validate_device, validate_sparsity, validate_sparsity_dense, \
-        validate_sparsity_block_size
-        validate_triton_block_size
+        validate_sparsity_block_size
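Two changes to the public namespace are visible in this hunk: the duplicate disable_validation re-export is dropped from utils (it remains available under validation), and the full-layout helper moves out of the deleted blksprs/utils/layout_utils.py, with build_full_sparsity_layout replaced by build_sparsity_layout_full from blksprs.layouting.sparsity_layout. A minimal import-migration sketch (the alias bs is an assumption, not taken from the package):

# blksprs 1.11
# from blksprs.utils.layout_utils import build_full_sparsity_layout

# blksprs 2.0rc1
import blksprs as bs
from blksprs.layouting.sparsity_layout import build_sparsity_layout_full

# The layouting namespace re-exports the new name, so both references resolve to the same function.
assert bs.layouting.build_sparsity_layout_full is build_sparsity_layout_full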
blksprs/layouting/distribution_layout.py
CHANGED
@@ -4,14 +4,14 @@ from torch import Tensor
 from triton import language as tl

 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import
-from blksprs.utils.validation import
+from blksprs.utils.tools import stride, get_autotune_configs
+from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous


 def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: Tensor,
                               dim: int, size_target: torch.Size,
-                              sparsity_block_size: int
+                              sparsity_block_size: int) -> Tensor:
     """Builds the sparsity layout of either the source of a gather or the target of a scatter operation.

     Args:
@@ -20,7 +20,6 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
         dim (int): The dimension along which the operation is conducted.
         size_target (torch.Size): The size of the block-sparse target tensor in regular form.
         sparsity_block_size (int): The size of the sparsity blocks.
-        triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).

     Returns:
         Tensor: The sparsity layout of the source or target tensor.
@@ -44,16 +43,11 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
     o_b, o_r, o_c = output.size()
     o_b_s, o_r_s, o_c_s = stride(output)

-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-
-    validate_triton_block_size(triton_block_size, sparsity_block_size)
-
     triton_grid = lambda meta: [i_b,
                                 triton.cdiv(i_r, meta["TRITON_BLOCK_SIZE"]),
                                 triton.cdiv(i_c, meta["TRITON_BLOCK_SIZE"])]

-    (
+    (build_distribution_layout_kernel[triton_grid]
     (indices,
      i_b, i_b_s, i_r_s, i_c_s,
      sparsity_lut_i,
@@ -61,27 +55,34 @@ def build_distribution_layout(indices: BlksprsTensor, sparsity_layout_indices: T
      adjusted_dim,
      output,
      o_b, o_b_s, o_r_s, o_c_s,
-     sparsity_block_size
-     triton_block_size))
+     sparsity_block_size))

     return output


+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=[],
+    reset_to_zero=["o"]
+)
 @triton.jit
-def
-
-
-
-
-
-
-
-
+def build_distribution_layout_kernel(i,
+                                     i_b, i_b_s, i_r_s, i_c_s,
+                                     s_lut_i,
+                                     s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+                                     dim,
+                                     o,
+                                     o_b, o_b_s, o_r_s, o_c_s,
+                                     sparsity_block_size,
+                                     TRITON_BLOCK_SIZE: tl.constexpr) -> None:
     # Get triton block indices
     pid_blk = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)

+    # Get valid triton block size
+    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
+
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_i_idx = (pid_blk * s_lut_i_r_s + 0 * s_lut_i_c_s)
     spa_bat_i_msk = (spa_bat_i_idx >= 0 and spa_bat_i_idx < s_lut_i_r * s_lut_i_r_s)
@@ -96,9 +97,12 @@ def kernel_distribution_layout(i,
     spa_col_i = tl.load(s_lut_i + spa_col_i_idx, mask=spa_col_i_msk)

     blk_i_idx = (pid_blk * i_b_s +
-                 ((pid_row *
-                 ((pid_col *
-    blk_i_msk = (blk_i_idx >= 0 and
+                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
+                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
+    blk_i_msk = ((blk_i_idx >= 0 and
+                  blk_i_idx < i_b * i_b_s) and
+                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
+                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk)

     dst_bat_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_i, dtype=tl.int32)
@@ -116,5 +120,8 @@ def kernel_distribution_layout(i,
     blk_o_idx = ((dst_bat_idx * o_b_s) +
                  (dst_row_idx * o_r_s) +
                  (dst_col_idx * o_c_s))
-    blk_o_msk = (blk_o_idx >= 0 and
+    blk_o_msk = ((blk_o_idx >= 0 and
+                  blk_o_idx < o_b * o_b_s) and
+                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
+                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
     tl.store(o + blk_o_idx, blk_v, mask=blk_o_msk)
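build_distribution_layout loses its manual triton_block_size argument: the kernel is now wrapped in @triton.autotune over get_autotune_configs() (with reset_to_zero=["o"] so the output is re-zeroed between benchmarking runs) and clamps its effective tile edge to val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE), masking out the excess lanes. The toy kernel below shows the same decorator pattern in isolation; get_autotune_configs_sketch and its config values are stand-ins, since the contents of blksprs' get_autotune_configs() are not part of this diff.

import triton
import triton.language as tl


def get_autotune_configs_sketch():
    # Hypothetical stand-in: tile sizes and warp counts for the autotuner to search.
    return [triton.Config({"TRITON_BLOCK_SIZE": tbs}, num_warps=w)
            for tbs in (16, 32, 64)
            for w in (2, 4)]


@triton.autotune(
    configs=get_autotune_configs_sketch(),
    key=[],               # empty key: tune once and reuse the chosen config for every launch
    reset_to_zero=["o"],  # re-zero the output buffer between benchmarking runs
)
@triton.jit
def mark_elements_kernel(o, n, sparsity_block_size, TRITON_BLOCK_SIZE: tl.constexpr):
    # Clamp the effective tile edge to the sparsity block, mirroring val_tbs above.
    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
    pid = tl.program_id(axis=0)
    offs = pid * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)
    # Mask both the out-of-bounds tail and the lanes beyond the valid tile edge.
    msk = (offs < n) & (tl.arange(0, TRITON_BLOCK_SIZE) < val_tbs)
    tl.store(o + offs, 1, mask=msk)

# Launch sketch (requires a CUDA device):
#   out = torch.zeros(256, dtype=torch.int32, device="cuda")
#   grid = lambda meta: (triton.cdiv(256, min(64, meta["TRITON_BLOCK_SIZE"])),)
#   mark_elements_kernel[grid](out, 256, 64)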
blksprs/layouting/sparsity_layout.py
CHANGED
@@ -3,21 +3,21 @@ import math
 import torch
 import triton
 from torch import Tensor
+from torch._library.triton import wrap_triton
 from triton import language as tl

 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import
-from blksprs.utils.validation import
+from blksprs.utils.tools import stride, get_autotune_configs
+from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous, validate_sparsity, validate_sparsity_block_size


-def build_sparsity_layout(x: Tensor, sparsity_block_size: int
+def build_sparsity_layout(x: Tensor, sparsity_block_size: int) -> Tensor:
     """Builds the sparsity layout of a dense tensor in regular form covering its sparse blocks.

     Args:
         x (Tensor): A block-sparse (or dense) tensor in regular form.
         sparsity_block_size (int): The size of the sparsity blocks.
-        triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).

     Returns:
         Tensor: The sparsity layout of the input block-sparse (or dense) tensor.
@@ -35,57 +35,61 @@ def build_sparsity_layout(x: Tensor, sparsity_block_size: int, triton_block_size
     o_b, o_r, o_c = output.size()
     o_b_s, o_r_s, o_c_s = stride(output)

-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-
-    validate_triton_block_size(triton_block_size, sparsity_block_size)
-
     triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+                                triton.cdiv(x_r, min(meta["sparsity_block_size"], meta["TRITON_BLOCK_SIZE"])),
+                                triton.cdiv(x_c, min(meta["sparsity_block_size"], meta["TRITON_BLOCK_SIZE"]))]

-    (
+    (wrap_triton(build_sparsity_layout_kernel)[triton_grid]
     (x,
      x_b, x_b_s, x_r_s, x_c_s,
      output,
      o_b, o_b_s, o_r_s, o_c_s,
-     sparsity_block_size
-     triton_block_size))
+     sparsity_block_size))

     return output


+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=[],
+    reset_to_zero=["o"]
+)
 @triton.jit
-def
-
-
-
-
-
+def build_sparsity_layout_kernel(x,
+                                 x_b, x_b_s, x_r_s, x_c_s,
+                                 o,
+                                 o_b, o_b_s, o_r_s, o_c_s,
+                                 sparsity_block_size,
+                                 TRITON_BLOCK_SIZE: tl.constexpr) -> None:
     # Get triton block indices
     pid_bat = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)

+    # Get valid triton block size
+    val_tbs = min(sparsity_block_size, TRITON_BLOCK_SIZE)
+
     # Load x values
     blk_x_idx = (pid_bat * x_b_s +
-                 ((pid_row *
-                 ((pid_col *
-    blk_x_msk = (blk_x_idx >= 0 and
+                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = ((blk_x_idx >= 0 and
+                  blk_x_idx < x_b * x_b_s) and
+                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < val_tbs and
+                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < val_tbs))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)

     # Store sparsity layout value
     if tl.min(blk_x) != 0 or tl.max(blk_x) != 0:
         blk_o_idx = (pid_bat * o_b_s +
-                     (((pid_row *
-                     ((pid_col *
+                     (((pid_row * val_tbs) // sparsity_block_size) * o_r_s +
+                     ((pid_col * val_tbs) // sparsity_block_size) * o_c_s))
         blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, 1, mask=blk_o_msk)


 def build_sparsity_layout_adaption(x: BlksprsTensor, sparsity_layout_from: Tensor,
-                                   sparsity_block_size_from: int, sparsity_block_size_to: int
-                                   triton_block_size: int = None) -> Tensor:
+                                   sparsity_block_size_from: int, sparsity_block_size_to: int) -> Tensor:
     """Builds the sparsity layout of a block-sparse tensor in compressed form if a different sparsity block size were
     used.

@@ -94,7 +98,6 @@ def build_sparsity_layout_adaption(x: BlksprsTensor, sparsity_layout_from: Tenso
         sparsity_layout_from (Tensor): The sparsity layout of the input block-sparse tensor.
         sparsity_block_size_from (int): The size of the sparsity blocks of the input tensor.
         sparsity_block_size_to (int): The desired size of the sparsity blocks for the resulting layout.
-        triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).

     Returns:
         Tensor: The sparsity layout in regular form using the new sparsity block size of the input block-sparse tensor
@@ -107,8 +110,6 @@ def build_sparsity_layout_adaption(x: BlksprsTensor, sparsity_layout_from: Tenso
     validate_sparsity(sparsity_block_size_from, (x, sparsity_layout_from))
     validate_sparsity_block_size(sparsity_block_size_from, x)
     validate_sparsity_block_size(sparsity_block_size_to)
-    min_sparsity_block_size = min(sparsity_block_size_from, sparsity_block_size_to)
-    validate_triton_block_size(triton_block_size, min_sparsity_block_size)

     sparsity_lut = torch.nonzero(sparsity_layout_from).contiguous()

@@ -126,40 +127,44 @@ def build_sparsity_layout_adaption(x: BlksprsTensor, sparsity_layout_from: Tenso
     s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
     o_b_s, o_r_s, o_c_s = stride(output)

-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size_from)
-
     triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+                                triton.cdiv(x_r, min(meta["sparsity_block_size_to"], meta["TRITON_BLOCK_SIZE"])),
+                                triton.cdiv(x_c, min(meta["sparsity_block_size_to"], meta["TRITON_BLOCK_SIZE"]))]

-    (
+    (wrap_triton(build_sparsity_layout_adaption_kernel)[triton_grid]
     (x,
      x_b, x_b_s, x_r_s, x_c_s,
      sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
      output,
      o_b, o_b_s, o_r_s, o_c_s,
      sparsity_block_size_from,
-     sparsity_block_size_to
-     triton_block_size))
+     sparsity_block_size_to))

     return output


+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=[],
+    reset_to_zero=["o"]
+)
 @triton.jit
-def
-
-
-
-
-
-
-
+def build_sparsity_layout_adaption_kernel(x,
+                                          x_b, x_b_s, x_r_s, x_c_s,
+                                          s_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+                                          o,
+                                          o_b, o_b_s, o_r_s, o_c_s,
+                                          sparsity_block_size_from,
+                                          sparsity_block_size_to,
+                                          TRITON_BLOCK_SIZE: tl.constexpr) -> None:
     # Get triton block indices
     pid_blk = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)

+    # Get valid triton block size
+    val_tbs = min(sparsity_block_size_to, TRITON_BLOCK_SIZE)
+
     # Get sparsity index of current output block consisting of its batch, row, and column index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
     spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
@@ -175,23 +180,26 @@ def kernel_sparsity_layout_adaption(x,

     # Load x values
     blk_x_idx = ((pid_blk * x_b_s) +
-                 ((pid_row *
-                 ((pid_col *
-    blk_x_msk = (blk_x_idx >= 0 and
+                 ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                 ((pid_col * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+    blk_x_msk = ((blk_x_idx >= 0 and
+                  blk_x_idx < x_b * x_b_s) and
+                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < sparsity_block_size_from and
+                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < sparsity_block_size_from))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)

     # Store sparsity layout value
     if tl.min(blk_x) != 0 or tl.max(blk_x) != 0:
         blk_o_idx = ((spa_bat * o_b_s) +
-                     (((
+                     (((pid_row * val_tbs + spa_row * sparsity_block_size_from)
                        // sparsity_block_size_to) * o_r_s) +
-                     (((
+                     (((pid_col * val_tbs + spa_col * sparsity_block_size_from)
                        // sparsity_block_size_to) * o_c_s))
         blk_o_msk = (blk_o_idx >= 0 and blk_o_idx < o_b * o_b_s)
         tl.store(o + blk_o_idx, 1, mask=blk_o_msk)


-def build_sparsity_layout_matmul(sparsity_layout_x: Tensor, sparsity_layout_y: Tensor):
+def build_sparsity_layout_matmul(sparsity_layout_x: Tensor, sparsity_layout_y: Tensor) -> Tensor:
     """Builds the precise sparsity layout of the result of a matrix multiplication between the two input tensors.

     Args:
@@ -225,3 +233,8 @@ def build_sparsity_layout_matmul_fast(sparsity_layout_x: Tensor, sparsity_layout
     sparsity_layout_y_slice = torch.max(sparsity_layout_y, dim=-2).values.unsqueeze(1)

     return torch.logical_or(sparsity_layout_x_slice, sparsity_layout_y_slice)
+
+
+def build_sparsity_layout_full(x: Tensor, sparsity_block_size: int) -> Tensor:
+    return torch.ones(size=(x.size(0), x.size(1) // sparsity_block_size, x.size(2) // sparsity_block_size),
+                      dtype=torch.bool, device=x.device)