blksprs 1.10.2__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blksprs/__init__.py +4 -6
- blksprs/layouting/distribution_layout.py +64 -48
- blksprs/layouting/sparsity_layout.py +96 -72
- blksprs/ops/conversion.py +350 -312
- blksprs/ops/distribution.py +320 -266
- blksprs/ops/flow.py +135 -89
- blksprs/ops/matmul.py +184 -151
- blksprs/ops/misc/broadcast_ops.py +68 -53
- blksprs/ops/misc/row_wise.py +223 -176
- blksprs/ops/partitioning.py +140 -89
- blksprs/ops/repeat.py +118 -108
- blksprs/ops/softmax.py +201 -167
- blksprs/ops/transpose.py +71 -131
- blksprs/utils/autotuning.py +78 -0
- blksprs/utils/benchmarking.py +3 -3
- blksprs/utils/processing.py +2 -1
- blksprs/utils/tools.py +5 -6
- blksprs/utils/validation.py +22 -16
- {blksprs-1.10.2.dist-info → blksprs-2.0.dist-info}/METADATA +55 -36
- blksprs-2.0.dist-info/RECORD +23 -0
- {blksprs-1.10.2.dist-info → blksprs-2.0.dist-info}/WHEEL +1 -1
- blksprs/ops/misc/exp.py +0 -104
- blksprs/utils/layout_utils.py +0 -17
- blksprs-1.10.2.dist-info/RECORD +0 -24
- {blksprs-1.10.2.dist-info → blksprs-2.0.dist-info}/top_level.txt +0 -0
blksprs/ops/misc/broadcast_ops.py

@@ -1,16 +1,20 @@
 import torch
 import triton
 from torch import Tensor
+from torch._library import triton_op
+from torch._library.triton import wrap_triton
 from triton import language as tl

 from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import
+from blksprs.utils.tools import stride
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_device, \
-    validate_sparsity_block_size
+    validate_sparsity_block_size


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
-                  sparsity_block_size: int
+                  sparsity_block_size: int) -> BlksprsTensor:
     """Performs a broadcast and subsequent addition of two dense tensors x and y. Returns a block-sparse tensor in
     compressed form.

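For reference, a minimal call under the 2.0 signature above might look like the sketch below. The import path follows the file list at the top of this diff; the shapes, dtypes, and the all-ones sparsity layout are illustrative assumptions, not code taken from the package.

```python
import torch

from blksprs.ops.misc.broadcast_ops import broadcast_add

# Assumed setup: batch of 2, vectors of length 128, 64x64 sparsity blocks.
b, n, sparsity_block_size = 2, 128, 64
x = torch.randn(b, n, device="cuda")
y = torch.randn(b, n, device="cuda")

# Block-granular layout of the (n x n) output; here every block is kept (dense layout).
sparsity_layout_o = torch.ones(b, n // sparsity_block_size, n // sparsity_block_size,
                               dtype=torch.bool, device="cuda")

# 2.0 drops the triton_block_size argument; the Triton block size is now chosen by autotuning.
o = broadcast_add(x, y, sparsity_layout_o, sparsity_block_size)
```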
@@ -19,7 +23,6 @@ def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
         y (Tensor): A dense input tensor.
         sparsity_layout_output (Tensor): The sparsity layout of the output tensor.
         sparsity_block_size (int): The size of the sparsity blocks.
-        triton_block_size (int, optional): The block size to use for the triton kernel (default ``None``).

     Returns:
         BlksprsTensor: The result of the operation as a block-sparse tensor in compressed form. Each element o(i, j) of the
@@ -34,7 +37,6 @@ def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
     if x.size(-1) != y.size(-1):
         raise ValueError("Dimensions of tensors must match")
     validate_sparsity_block_size(sparsity_block_size)
-    validate_triton_block_size(triton_block_size, sparsity_block_size)

     sparsity_lut_o = torch.nonzero(sparsity_layout_output).contiguous()

@@ -42,56 +44,66 @@ def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,

     validate_contiguous(sparsity_layout_output, sparsity_lut_o)

-
-
-    x_b, x_c = x.size()
-    x_b_s, x_c_s = stride(x)
-    y_b, y_c = y.size()
-    y_b_s, y_c_s = stride(y)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_lut_o_r, s_lut_o_c = sparsity_lut_o.size()
-    s_lut_o_r_s, s_lut_o_c_s = stride(sparsity_lut_o)
-
-    if triton_block_size is None:
-        triton_block_size = get_triton_block_size(sparsity_block_size)
-
-    triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-
-    (kernel_broadcast_addition[triton_grid]
-     (x,
-      x_b, x_b_s, x_c_s,
-      y,
-      y_b, y_b_s, y_c_s,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      sparsity_lut_o, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
-      sparsity_block_size,
-      triton_block_size))
-
-    return BlksprsTensor(output)
+    return BlksprsTensor(broadcast_add_forward(x, y, sparsity_lut_o, sparsity_block_size, n_sparse_blocks))


 def broadcast_sub(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
-                  sparsity_block_size: int
+                  sparsity_block_size: int) -> BlksprsTensor:
     """Wrapper for ``broadcast_add`` with negated y.

     """
-    return broadcast_add(x, torch.neg(y), sparsity_layout_output, sparsity_block_size
-
-
+    return broadcast_add(x, torch.neg(y), sparsity_layout_output, sparsity_block_size)
+
+
+@triton_op("blksprs::broadcast_add_forward", mutates_args={})
+def broadcast_add_forward(x: Tensor, y: Tensor,
+                          sparsity_lut_o: Tensor,
+                          sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
+    with torch.no_grad():
+        output = torch.zeros(n_sparse_blocks, sparsity_block_size, sparsity_block_size, dtype=x.dtype, device=x.device)
+
+        x_b, x_c = x.size()
+        x_b_s, x_c_s = stride(x)
+        y_b, y_c = y.size()
+        y_b_s, y_c_s = stride(y)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        s_lut_o_r, s_lut_o_c = sparsity_lut_o.size()
+        s_lut_o_r_s, s_lut_o_c_s = stride(sparsity_lut_o)
+
+        triton_grid = lambda meta: [o_b,
+                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+
+        (wrap_triton(broadcast_add_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_c_s,
+          y,
+          y_b, y_b_s, y_c_s,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          sparsity_lut_o, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+          sparsity_block_size))
+
+        return output
+
+
+@triton.autotune(
+    configs=get_autotune_configs(),
+    key=["sparsity_block_size"],
+    prune_configs_by={"early_config_prune": prune_autotune_configs},
+    reset_to_zero=["o"]
+)
 @triton.jit
-def kernel_broadcast_addition(x,
-
-
-
-
-
-
-
-
+def broadcast_add_kernel(x,
+                         x_b, x_b_s, x_c_s,
+                         y,
+                         y_b, y_b_s, y_c_s,
+                         o,
+                         o_b, o_b_s, o_r_s, o_c_s,
+                         s_lut_o, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+                         sparsity_block_size,
+                         TRITON_BLOCK_SIZE: tl.constexpr) -> None:
     # Get triton block indices
     pid_blk = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
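The broadcast_add_forward wrapper above is registered through torch._library's Triton integration, which lets torch.compile treat it as a custom op while it still launches a @triton.jit kernel internally. Below is a minimal, self-contained sketch of that registration pattern; the demo::add_one op and its kernel are invented for illustration and are not part of blksprs (recent PyTorch releases also expose the same helpers publicly as torch.library.triton_op and torch.library.wrap_triton).

```python
import torch
import triton
import triton.language as tl
from torch._library import triton_op
from torch._library.triton import wrap_triton


@triton.jit
def add_one_kernel(x_ptr, o_ptr, n_elements, BLOCK: tl.constexpr):
    # Each program instance handles one contiguous block of elements.
    offsets = tl.program_id(axis=0) * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    tl.store(o_ptr + offsets, tl.load(x_ptr + offsets, mask=mask) + 1, mask=mask)


@triton_op("demo::add_one", mutates_args={})
def add_one(x: torch.Tensor) -> torch.Tensor:
    # triton_op registers the function as a PyTorch custom op; wrap_triton makes the
    # kernel launch traceable from inside the registered op.
    output = torch.empty_like(x)
    grid = lambda meta: (triton.cdiv(x.numel(), meta["BLOCK"]),)
    wrap_triton(add_one_kernel)[grid](x, output, x.numel(), BLOCK=1024)
    return output
```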
@@ -112,16 +124,18 @@ def kernel_broadcast_addition(x,

     # Load x block
     blk_x_idx = (spa_bat_o * x_b_s +
-                 ((
+                 ((pid_row * TRITON_BLOCK_SIZE + spa_row_o * sparsity_block_size +
                    tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx >= 0 and
+    blk_x_msk = (blk_x_idx >= 0 and
+                 blk_x_idx < x_b * x_b_s)
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)

     # Load y block
     blk_y_idx = (spa_bat_o * y_b_s +
-                 ((
+                 ((pid_col * TRITON_BLOCK_SIZE + spa_col_o * sparsity_block_size +
                    tl.arange(0, TRITON_BLOCK_SIZE)) * y_c_s)[None, :])
-    blk_y_msk = (blk_y_idx >= 0 and
+    blk_y_msk = (blk_y_idx >= 0 and
+                 blk_y_idx < y_b * y_b_s)
     blk_y = tl.load(y + blk_y_idx, mask=blk_y_msk)

     # Compute sum
@@ -132,5 +146,6 @@ def kernel_broadcast_addition(x,
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx >= 0 and
+    blk_o_msk = (blk_o_idx >= 0 and
+                 blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
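The @triton.autotune decorator on broadcast_add_kernel pulls its candidate configurations from the new blksprs/utils/autotuning.py (+78 lines), which this diff does not show. The sketch below is purely hypothetical and only illustrates the general shape of such a helper pair; the actual configuration space, pruning rule, and signatures in 2.0 may differ.

```python
import triton


def get_autotune_configs():
    # Hypothetical candidate TRITON_BLOCK_SIZE / num_warps combinations for @triton.autotune.
    return [
        triton.Config({"TRITON_BLOCK_SIZE": block_size}, num_warps=num_warps)
        for block_size in (16, 32, 64, 128)
        for num_warps in (2, 4)
    ]


def prune_autotune_configs(configs, named_args, **kwargs):
    # Hypothetical early_config_prune hook: drop configs whose Triton block size exceeds
    # the sparsity block size, since the grid tiles each sparsity block with
    # cdiv(sparsity_block_size, TRITON_BLOCK_SIZE) programs per axis.
    sparsity_block_size = named_args["sparsity_block_size"]
    return [config for config in configs
            if config.kwargs["TRITON_BLOCK_SIZE"] <= sparsity_block_size]
```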