blksprs-2.0rc7-py3-none-any.whl → blksprs-2.0rc8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -60,39 +60,40 @@ def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
          sparsity_block_size, n_sparse_blocks_output, flag_slice_only)), sparsity_layout_output


- @triton_op("blksprs::row_wise_sum", mutates_args={})
+ @triton_op("blksprs::row_wise_sum_forward", mutates_args={})
  def row_wise_sum_forward(x: Tensor, sparsity_lut: Tensor,
                           sparsity_layout_output: Tensor, sparsity_reverse_lut_output: Tensor,
                           sparsity_block_size: int, n_sparse_blocks_output: int,
                           flag_slice_only: bool = False) -> Tensor:
-     output = torch.zeros(
-         size=(n_sparse_blocks_output, sparsity_block_size, 1 if flag_slice_only else sparsity_block_size),
-         dtype=x.dtype, device=x.device)
-
-     x_b, x_r, x_c = x.size()
-     x_b_s, x_r_s, x_c_s = stride(x)
-     s_lut_x_r, s_lut_x_c = sparsity_lut.size()
-     s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
-     o_b, o_r, o_c = output.size()
-     o_b_s, o_r_s, o_c_s = stride(output)
-     s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
-     s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
-
-     triton_grid = lambda meta: [x_b,
-                                 triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                 triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-
-     (wrap_triton(row_wise_sum_kernel)[triton_grid]
-      (x,
-       x_b, x_b_s, x_r_s, x_c_s,
-       sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-       output,
-       o_b, o_b_s, o_r_s,
-       s_l_o_b, s_l_o_b_s, s_l_o_r_s,
-       sparsity_reverse_lut_output,
-       sparsity_block_size))
-
-     return output
+     with torch.no_grad():
+         output = torch.zeros(
+             size=(n_sparse_blocks_output, sparsity_block_size, 1 if flag_slice_only else sparsity_block_size),
+             dtype=x.dtype, device=x.device)
+
+         x_b, x_r, x_c = x.size()
+         x_b_s, x_r_s, x_c_s = stride(x)
+         s_lut_x_r, s_lut_x_c = sparsity_lut.size()
+         s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
+         o_b, o_r, o_c = output.size()
+         o_b_s, o_r_s, o_c_s = stride(output)
+         s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
+         s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
+
+         triton_grid = lambda meta: [x_b,
+                                     triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                     triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+
+         (wrap_triton(row_wise_sum_kernel)[triton_grid]
+          (x,
+           x_b, x_b_s, x_r_s, x_c_s,
+           sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+           output,
+           o_b, o_b_s, o_r_s,
+           s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+           sparsity_reverse_lut_output,
+           sparsity_block_size))
+
+         return output


  # noinspection PyUnusedLocal
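A pattern repeated throughout this release: every registered forward now runs under torch.no_grad(), since gradients come from the backward registered via register_autograd rather than from tracing the op body. A minimal sketch of that registration pattern, assuming the triton_op/register_autograd API imported as in this diff; the op name "mylib::double" and all helpers below are hypothetical, not part of blksprs:

    import torch
    from torch import Tensor
    from torch.library import triton_op


    @triton_op("mylib::double", mutates_args={})
    def double_forward(x: Tensor) -> Tensor:
        # The forward body is not traced for autograd; grads are supplied below.
        with torch.no_grad():
            return x * 2


    def double_wrapper_backward(ctx, grad_output):
        # d(2x)/dx = 2, so the incoming gradient is scaled by two.
        return grad_output * 2


    def double_setup_context(ctx, inputs, output):
        pass  # nothing needs to be saved for this toy backward


    double_forward.register_autograd(double_wrapper_backward, setup_context=double_setup_context)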
@@ -132,25 +133,22 @@ def row_wise_sum_kernel(x,
      rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
      rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)

-     if rev_idx_spa == -1:
-         tl.device_assert(False)
-         return
-
-     blk_idx = ((pid_blk * x_b_s) +
-                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-     blk_msk = (blk_idx >= 0 and
-                blk_idx < x_b * x_b_s)
-     blk = tl.load(x + blk_idx, mask=blk_msk)
+     if rev_idx_spa >= 0:
+         blk_idx = ((pid_blk * x_b_s) +
+                    ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                    ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+         blk_msk = (blk_idx >= 0 and
+                    blk_idx < x_b * x_b_s)
+         blk = tl.load(x + blk_idx, mask=blk_msk)

-     buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
+         buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))

-     o_idx = (rev_idx_spa * o_b_s +
-              ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-              (tl.arange(0, 1))[None, :])
-     o_msk = (o_idx >= 0 and
-              o_idx < o_b * o_b_s)
-     tl.atomic_add(o + o_idx, buf, o_msk)
+         o_idx = (rev_idx_spa * o_b_s +
+                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                  (tl.arange(0, 1))[None, :])
+         o_msk = (o_idx >= 0 and
+                  o_idx < o_b * o_b_s)
+         tl.atomic_add(o + o_idx, buf, o_msk)


  @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -176,7 +174,7 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
      of the input and the sparsity layout of the output tensor.

      """
-     # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376
+     # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376, should be fixed with the upcoming 3.4.0 release
      x = torch.where(x == -0.0, torch.tensor(0.0), x)
      x = x.contiguous()

@@ -204,41 +202,42 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
          n_sparse_blocks_output, flag_slice_only)), sparsity_layout_output


- @triton_op("blksprs::row_wise_max", mutates_args={})
+ @triton_op("blksprs::row_wise_max_forward", mutates_args={})
  def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
                           sparsity_layout_output: Tensor, sparsity_reverse_lut_output: Tensor,
                           sparsity_block_size: int, n_sparse_blocks_output: int,
                           flag_slice_only: bool = False) -> Tensor:
-     output = torch.full(size=(n_sparse_blocks_output,
-                               sparsity_block_size,
-                               1 if flag_slice_only else sparsity_block_size),
-                         fill_value=torch.finfo(x.dtype).min,
-                         device=x.device)
-
-     x_b, x_r, x_c = x.size()
-     x_b_s, x_r_s, x_c_s = stride(x)
-     s_lut_x_r, s_lut_x_c = sparsity_lut.size()
-     s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
-     o_b, o_r, o_c = output.size()
-     o_b_s, o_r_s, o_c_s = stride(output)
-     s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
-     s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
-
-     triton_grid = lambda meta: [x_b,
-                                 triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                 triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-
-     (wrap_triton(row_wise_max_kernel)[triton_grid]
-      (x,
-       x_b, x_b_s, x_r_s, x_c_s,
-       sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-       output,
-       o_b, o_b_s, o_r_s,
-       s_l_o_b, s_l_o_b_s, s_l_o_r_s,
-       sparsity_reverse_lut_output,
-       sparsity_block_size))
-
-     return output
+     with torch.no_grad():
+         output = torch.full(size=(n_sparse_blocks_output,
+                                   sparsity_block_size,
+                                   1 if flag_slice_only else sparsity_block_size),
+                             fill_value=torch.finfo(x.dtype).min,
+                             device=x.device)
+
+         x_b, x_r, x_c = x.size()
+         x_b_s, x_r_s, x_c_s = stride(x)
+         s_lut_x_r, s_lut_x_c = sparsity_lut.size()
+         s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut)
+         o_b, o_r, o_c = output.size()
+         o_b_s, o_r_s, o_c_s = stride(output)
+         s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_output.size()
+         s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_output)
+
+         triton_grid = lambda meta: [x_b,
+                                     triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                     triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+
+         (wrap_triton(row_wise_max_kernel)[triton_grid]
+          (x,
+           x_b, x_b_s, x_r_s, x_c_s,
+           sparsity_lut, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+           output,
+           o_b, o_b_s, o_r_s,
+           s_l_o_b, s_l_o_b_s, s_l_o_r_s,
+           sparsity_reverse_lut_output,
+           sparsity_block_size))
+
+         return output


  # noinspection PyUnusedLocal
@@ -278,25 +277,22 @@ def row_wise_max_kernel(x,
      rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
      rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)

-     if rev_idx_spa == -1:
-         tl.device_assert(False)
-         return
-
-     blk_idx = ((pid_blk * x_b_s) +
-                ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
-                ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-     blk_msk = (blk_idx >= 0 and
-                blk_idx < x_b * x_b_s)
-     blk = tl.load(x + blk_idx, mask=blk_msk)
+     if rev_idx_spa >= 0:
+         blk_idx = ((pid_blk * x_b_s) +
+                    ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
+                    ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
+         blk_msk = (blk_idx >= 0 and
+                    blk_idx < x_b * x_b_s)
+         blk = tl.load(x + blk_idx, mask=blk_msk)

-     buf = tl.reshape(tl.max(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))
+         buf = tl.reshape(tl.max(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))

-     o_idx = (rev_idx_spa * o_b_s +
-              ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
-              (tl.arange(0, 1))[None, :])
-     o_msk = (o_idx >= 0 and
-              o_idx < o_b * o_b_s)
-     tl.atomic_max(o + o_idx, buf, o_msk)
+         o_idx = (rev_idx_spa * o_b_s +
+                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
+                  (tl.arange(0, 1))[None, :])
+         o_msk = (o_idx >= 0 and
+                  o_idx < o_b * o_b_s)
+         tl.atomic_max(o + o_idx, buf, o_msk)


  @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
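Both row-wise kernels now guard the whole block computation with `if rev_idx_spa >= 0:` instead of asserting and returning early when the reverse LUT holds -1: a program assigned to an absent output block simply does nothing rather than triggering a device-side abort. A sketch of the same guard in a self-contained kernel; all names here are illustrative, not the blksprs kernels:

    import triton
    import triton.language as tl


    @triton.jit
    def guarded_copy_kernel(src, dst, lut, n_elems, BLOCK: tl.constexpr):
        # Hypothetical kernel: a negative LUT entry marks an absent block,
        # so the program idles instead of asserting and returning early.
        pid = tl.program_id(0)
        target = tl.load(lut + pid).to(tl.int32)
        if target >= 0:
            offs = pid * BLOCK + tl.arange(0, BLOCK)
            msk = offs < n_elems
            vals = tl.load(src + offs, mask=msk)
            tl.store(dst + target * BLOCK + tl.arange(0, BLOCK), vals, mask=msk)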
@@ -343,41 +339,43 @@ def row_wise_sub(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
      return row_wise_add(x, sparsity_layout_x, torch.neg(y), sparsity_block_size)


- @triton_op("blksprs::row_wise_add", mutates_args={})
+ @triton_op("blksprs::row_wise_add_forward", mutates_args={})
  def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
                           sparsity_layout_x_rwm: Tensor, sparsity_reverse_x_lut_rwm: Tensor,
                           y: Tensor, sparsity_block_size: int) -> Tensor:
-     output = torch.zeros_like(x)
-
-     x_b, x_r, x_c = x.size()
-     x_b_s, x_r_s, x_c_s = stride(x)
-     s_lut_r, s_lut_c = sparsity_lut_x.size()
-     s_lut_r_s, s_lut_c_s = stride(sparsity_lut_x)
-     y_b, y_r, y_c = y.size()
-     y_b_s, y_r_s, y_c_s = stride(y)
-     s_l_y_b, s_l_y_r, s_l_y_c = sparsity_layout_x_rwm.size()
-     s_l_y_b_s, s_l_y_r_s, s_l_y_c_s = stride(sparsity_layout_x_rwm)
-     o_b, o_r, o_c = output.size()
-     o_b_s, o_r_s, o_c_s = stride(output)
-
-     triton_grid = lambda meta: [o_b,
-                                 triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                 triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-
-     (wrap_triton(kernel_blocksparse_row_wise_add)[triton_grid]
-      (x,
-       x_b, x_b_s, x_r_s, x_c_s,
-       sparsity_lut_x, s_lut_r, s_lut_r_s, s_lut_c_s,
-       y, y_b, y_b_s, y_r_s, y_c_s,
-       s_l_y_b, s_l_y_b_s, s_l_y_r_s,
-       sparsity_reverse_x_lut_rwm,
-       output,
-       o_b, o_b_s, o_r_s, o_c_s,
-       sparsity_block_size))
-
-     return output
+     with torch.no_grad():
+         output = torch.zeros_like(x)
+
+         x_b, x_r, x_c = x.size()
+         x_b_s, x_r_s, x_c_s = stride(x)
+         s_lut_r, s_lut_c = sparsity_lut_x.size()
+         s_lut_r_s, s_lut_c_s = stride(sparsity_lut_x)
+         y_b, y_r, y_c = y.size()
+         y_b_s, y_r_s, y_c_s = stride(y)
+         s_l_y_b, s_l_y_r, s_l_y_c = sparsity_layout_x_rwm.size()
+         s_l_y_b_s, s_l_y_r_s, s_l_y_c_s = stride(sparsity_layout_x_rwm)
+         o_b, o_r, o_c = output.size()
+         o_b_s, o_r_s, o_c_s = stride(output)
+
+         triton_grid = lambda meta: [o_b,
+                                     triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                     triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+
+         (wrap_triton(kernel_blocksparse_row_wise_add)[triton_grid]
+          (x,
+           x_b, x_b_s, x_r_s, x_c_s,
+           sparsity_lut_x, s_lut_r, s_lut_r_s, s_lut_c_s,
+           y, y_b, y_b_s, y_r_s, y_c_s,
+           s_l_y_b, s_l_y_b_s, s_l_y_r_s,
+           sparsity_reverse_x_lut_rwm,
+           output,
+           o_b, o_b_s, o_r_s, o_c_s,
+           sparsity_block_size))
+
+         return output


+ # noinspection PyUnusedLocal
  @triton.autotune(
      configs=get_autotune_configs(),
      key=["sparsity_block_size"],
@@ -46,14 +46,15 @@ def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
          partitions, adjusted_dim, sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_output"]


- @triton_op("blksprs::split", mutates_args={})
+ @triton_op("blksprs::split_forward", mutates_args={})
  def split_forward(x: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
                    _: int, __: int, sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-     return flow_pull_forward(x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
-                              n_sparse_blocks)
+     with torch.no_grad():
+         return flow_pull_forward(x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
+                                  n_sparse_blocks)


- def split_backward(ctx, grad_output):
+ def split_wrapper_backward(ctx, grad_output):
      sparsity_layout = ctx.saved_tensors[0]
      num_partitions = ctx.num_partitions
      dim = ctx.dim
@@ -109,7 +110,7 @@ def split_setup_context(ctx, inputs, output):
      ctx.sparsity_block_size = sparsity_block_size


- split_forward.register_autograd(split_backward, setup_context=split_setup_context)
+ split_forward.register_autograd(split_wrapper_backward, setup_context=split_setup_context)


  @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -150,14 +151,15 @@ def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
          partitions, adjusted_dim, sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_output"]


- @triton_op("blksprs::merge", mutates_args={})
+ @triton_op("blksprs::merge_forward", mutates_args={})
  def merge_forward(x: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
                    _: int, __: int, sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-     return flow_pull_forward(x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
-                              n_sparse_blocks)
+     with torch.no_grad():
+         return flow_pull_forward(x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
+                                  n_sparse_blocks)


- def merge_backward(ctx, grad_output):
+ def merge_wrapper_backward(ctx, grad_output):
      sparsity_layout = ctx.saved_tensors[0]
      num_partitions = ctx.num_partitions
      dim = ctx.dim
@@ -216,4 +218,4 @@ def merge_setup_context(ctx, inputs, output):
      ctx.sparsity_block_size = sparsity_block_size


- merge_forward.register_autograd(merge_backward, setup_context=merge_setup_context)
+ merge_forward.register_autograd(merge_wrapper_backward, setup_context=merge_setup_context)
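split_forward and merge_forward, like repeat_forward and transpose_forward further down, all delegate to the shared flow_pull_forward, in which each output block pulls its contents from the input block named by a reverse look-up table. A dense sketch of that pull semantics under assumed conventions (-1 marking an absent source block, matching the kernel guards above); an illustration, not the library kernel:

    import torch


    def flow_pull_dense(x_blocks: torch.Tensor, reverse_lut: torch.Tensor,
                        sparsity_block_size: int) -> torch.Tensor:
        # One output block per reverse-LUT entry; entry i names its source block.
        out = torch.zeros(reverse_lut.numel(), sparsity_block_size, sparsity_block_size,
                          dtype=x_blocks.dtype, device=x_blocks.device)
        for o_idx, i_idx in enumerate(reverse_lut.tolist()):
            if i_idx >= 0:  # -1 marks an absent block and is skipped
                out[o_idx] = x_blocks[i_idx]
        return out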
blksprs/ops/repeat.py CHANGED
@@ -92,15 +92,16 @@ def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,
          lut["sparsity_reverse_lut"], sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_o"]


- @triton_op("blksprs::repeat", mutates_args={})
+ @triton_op("blksprs::repeat_forward", mutates_args={})
  def repeat_forward(x: Tensor, _: Tensor, sparsity_layout_o: Tensor, sparsity_lut: Tensor,
                     sparsity_reverse_lut: Tensor,
                     sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-     return flow_pull_forward(x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
-                              n_sparse_blocks)
+     with torch.no_grad():
+         return flow_pull_forward(x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut, sparsity_block_size,
+                                  n_sparse_blocks)


- def repeat_backward(ctx, grad_output):
+ def repeat_wrapper_backward(ctx, grad_output):
      sparsity_layout_x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut = ctx.saved_tensors
      sparsity_block_size = ctx.sparsity_block_size
      n_sparse_blocks = torch.sum(sparsity_layout_x.to(torch.int)).item()
@@ -190,4 +191,4 @@ def repeat_setup_context(ctx, inputs, output):
      ctx.sparsity_block_size = sparsity_block_size


- repeat_forward.register_autograd(repeat_backward, setup_context=repeat_setup_context)
+ repeat_forward.register_autograd(repeat_wrapper_backward, setup_context=repeat_setup_context)
blksprs/ops/softmax.py CHANGED
@@ -47,19 +47,13 @@ def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
          sparsity_block_size))


- @triton_op("blksprs::softmax", mutates_args={})
+ @triton_op("blksprs::softmax_forward", mutates_args={})
  def softmax_forward(x: Tensor, sparsity_layout: Tensor,
                      sparsity_lut: Tensor,
                      sparsity_reverse_lut_rws: Tensor,
                      sparsity_block_size: int) -> Tensor:
      output = torch.zeros_like(x)

-     x_b, x_r, x_c = x.size()
-     x_b_s, x_r_s, x_c_s = stride(x)
-     s_lut_r, s_lut_c = sparsity_lut.size()
-     s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
-     o_b, o_r, o_c = output.size()
-
      x_row_wise_max, sparsity_layout_rwm = row_wise_max(x, sparsity_layout, sparsity_block_size,
                                                         flag_slice_only=True)
      x_scaled = row_wise_sub(x, sparsity_layout, x_row_wise_max, sparsity_block_size)
@@ -67,6 +61,11 @@ def softmax_forward(x: Tensor, sparsity_layout: Tensor,
      x_exp_row_wise_sum, sparsity_layout_rws = row_wise_sum(x_exp, sparsity_layout, sparsity_block_size,
                                                             flag_slice_only=True)

+     x_b, x_r, x_c = x.size()
+     x_b_s, x_r_s, x_c_s = stride(x)
+     s_lut_r, s_lut_c = sparsity_lut.size()
+     s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
+     o_b, o_r, o_c = output.size()
      s_b, s_r, s_c = x_exp_row_wise_sum.shape
      s_b_s, s_r_s, s_c_s = stride(x_exp_row_wise_sum)
      s_l_s_b, s_l_s_r, s_l_s_c = sparsity_layout_rws.shape
@@ -89,50 +88,58 @@ def softmax_forward(x: Tensor, sparsity_layout: Tensor,
      return output


- def softmax_backward(ctx, grad_output):
+ def softmax_backward_wrapper(ctx, grad_output):
      o, sparsity_layout, sparsity_lut = ctx.saved_tensors
      sparsity_block_size = ctx.sparsity_block_size

-     s, sparsity_layout_s = row_wise_sum(grad_output * o, sparsity_layout, sparsity_block_size, flag_slice_only=True)
-
-     sparsity_layout_s_flat = sparsity_layout_s.reshape(-1)
-     sparsity_reverse_lut_s = ((torch.cumsum(sparsity_layout_s_flat, dim=-1) - 1) *
-                               (sparsity_layout_s_flat == 1) -
-                               (1 * (sparsity_layout_s_flat == 0)))
-
-     o_b, o_r, o_c = o.size()
-     o_b_s, o_r_s, o_c_s = stride(o)
-     s_lut_r, s_lut_c = sparsity_lut.size()
-     s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
-     s_b, s_r, s_c = s.size()
-     s_b_s, s_r_s, s_c_s = stride(s)
-     s_l_s_b, s_l_s_r, s_l_s_c = sparsity_layout_s.size()
-     s_l_s_b_s, s_l_s_r_s, s_l_s_c_s = stride(sparsity_layout_s)
-
-     grad_x = torch.zeros_like(o, dtype=torch.float)
-
-     triton_grid = lambda meta: [o_b,
-                                 triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                 triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-
-     # TODO wrap
-     (softmax_kernel_grad[triton_grid]
-      (grad_output,
-       o_b, o_b_s, o_r_s, o_c_s,
-       o,
-       o_b, o_b_s, o_r_s, o_c_s,
-       sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-       s,
-       s_b, s_b_s, s_r_s, s_c_s,
-       s_l_s_b, s_l_s_b_s, s_l_s_r_s,
-       sparsity_reverse_lut_s,
-       grad_x,
-       o_b, o_b_s, o_r_s, o_c_s,
-       sparsity_block_size))
-
-     return grad_x, None, None, None, None, None
+     return softmax_backward(grad_output, o, sparsity_lut, sparsity_layout,
+                             sparsity_block_size), None, None, None, None, None
+
+
+ @triton_op("blksprs::softmax_backward", mutates_args={})
+ def softmax_backward(grad_output: Tensor, o: Tensor, sparsity_lut: Tensor, sparsity_layout: Tensor,
+                      sparsity_block_size: int) -> Tensor:
+     with torch.no_grad():
+         s, sparsity_layout_s = row_wise_sum(grad_output * o, sparsity_layout, sparsity_block_size, flag_slice_only=True)
+
+         sparsity_layout_s_flat = sparsity_layout_s.reshape(-1)
+         sparsity_reverse_lut_s = ((torch.cumsum(sparsity_layout_s_flat, dim=-1) - 1) *
+                                   (sparsity_layout_s_flat == 1) -
+                                   (1 * (sparsity_layout_s_flat == 0)))
+
+         o_b, o_r, o_c = o.size()
+         o_b_s, o_r_s, o_c_s = stride(o)
+         s_lut_r, s_lut_c = sparsity_lut.size()
+         s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
+         s_b, s_r, s_c = s.size()
+         s_b_s, s_r_s, s_c_s = stride(s)
+         s_l_s_b, s_l_s_r, s_l_s_c = sparsity_layout_s.size()
+         s_l_s_b_s, s_l_s_r_s, s_l_s_c_s = stride(sparsity_layout_s)
+
+         grad_x = torch.zeros_like(o, dtype=torch.float)
+
+         triton_grid = lambda meta: [o_b,
+                                     triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                     triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+
+         (wrap_triton(softmax_kernel_grad)[triton_grid]
+          (grad_output,
+           o_b, o_b_s, o_r_s, o_c_s,
+           o,
+           o_b, o_b_s, o_r_s, o_c_s,
+           sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+           s,
+           s_b, s_b_s, s_r_s, s_c_s,
+           s_l_s_b, s_l_s_b_s, s_l_s_r_s,
+           sparsity_reverse_lut_s,
+           grad_x,
+           o_b, o_b_s, o_r_s, o_c_s,
+           sparsity_block_size))
+
+         return grad_x


+ # noinspection PyUnusedLocal
  @triton.autotune(
      configs=get_autotune_configs(),
      key=["sparsity_block_size"],
@@ -193,6 +200,7 @@ def softmax_kernel(x,
      tl.store(o + blk_x_idx, buf, mask=blk_x_msk)


+ # noinspection PyUnusedLocal
  @triton.autotune(
      configs=get_autotune_configs(),
      key=["sparsity_block_size"],
@@ -293,4 +301,4 @@ def softmax_setup_context(ctx, inputs, output):
      ctx.sparsity_block_size = sparsity_block_size


- softmax_forward.register_autograd(softmax_backward, setup_context=softmax_setup_context)
+ softmax_forward.register_autograd(softmax_backward_wrapper, setup_context=softmax_setup_context)
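The softmax backward is now a registered op of its own, blksprs::softmax_backward, with its kernel launched through wrap_triton, which resolves the old `# TODO wrap`. The quantity it computes is the standard softmax VJP: with o = softmax(x) and incoming gradient g, grad_x = o * (g - sum(g * o, dim=-1)), where the row sums come from the blocksparse row_wise_sum above. A dense reference for comparison (a sketch, not the library code):

    import torch


    def softmax_backward_dense(g: torch.Tensor, o: torch.Tensor) -> torch.Tensor:
        # VJP of o = softmax(x): grad_x = o * (g - per-row inner product of g and o).
        return o * (g - (g * o).sum(dim=-1, keepdim=True))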
blksprs/ops/transpose.py CHANGED
@@ -28,7 +28,6 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor,

      """
      x = x.contiguous()
-     x_t = x.transpose(-1, -2).contiguous()

      validate_dimensions(x)
      validate_contiguous(x)
@@ -38,20 +37,22 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor,

      lut = transpose_build_lut(lut, sparsity_layout)

-     return BlksprsTensor(transpose_forward(x_t, lut["sparsity_layout_t"],
+     return BlksprsTensor(transpose_forward(x, lut["sparsity_layout_t"],
                                             lut["sparsity_lut"], lut["sparsity_reverse_lut"],
                                             sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_t"]


- @triton_op("blksprs::transpose", mutates_args={})
+ @triton_op("blksprs::transpose_forward", mutates_args={})
  def transpose_forward(x: Tensor, sparsity_layout_o: Tensor,
                        sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
                        sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-     return flow_pull_forward(x, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut,
-                              sparsity_block_size, n_sparse_blocks)
+     with torch.no_grad():
+         x_t = x.transpose(-1, -2).contiguous()
+         return flow_pull_forward(x_t, sparsity_layout_o, sparsity_lut, sparsity_reverse_lut,
+                                  sparsity_block_size, n_sparse_blocks)


- def transpose_backward(ctx, grad_output):
+ def transpose_wrapper_backward(ctx, grad_output):
      sparsity_layout = ctx.saved_tensors[0]
      sparsity_block_size = ctx.sparsity_block_size

@@ -96,4 +97,4 @@ def transpose_setup_context(ctx, inputs, output):
      ctx.sparsity_block_size = sparsity_block_size


- transpose_forward.register_autograd(transpose_backward, setup_context=transpose_setup_context)
+ transpose_forward.register_autograd(transpose_wrapper_backward, setup_context=transpose_setup_context)
@@ -2,15 +2,7 @@ import os

  blksprs_autotune_mode = os.getenv("BLKSPRS_AUTOTUNE", "DEFAULT")

- if blksprs_autotune_mode == "TEST":
-     autotune_parameters = [
-         (16, 3, 8),
-
-         (32, 3, 8),
-
-         (64, 3, 8),
-     ]
- elif blksprs_autotune_mode == "DEFAULT":
+ if blksprs_autotune_mode == "DEFAULT":
      autotune_parameters = [
          (16, 3, 8),
          (16, 4, 4),
@@ -28,6 +20,14 @@ elif blksprs_autotune_mode == "DEFAULT":
          (128, 4, 4),
          (128, 5, 2),
      ]
+ elif blksprs_autotune_mode == "TEST":
+     autotune_parameters = [
+         (16, 3, 8),
+
+         (32, 3, 8),
+
+         (64, 3, 8),
+     ]
  else:
      raise NotImplementedError(f"Unknown autotune mode: {blksprs_autotune_mode}")

@@ -75,4 +75,4 @@ def get_autotune_configs():
          autotune_configs.append(
              triton.Config({"TRITON_BLOCK_SIZE": block_size}, num_stages=num_stages, num_warps=num_warps))

-     return autotune_configs
+     return autotune_configs
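The branches were reordered so the full DEFAULT search space is listed first and the reduced TEST space second; either way, each (block_size, num_stages, num_warps) tuple is expanded into a triton.Config keyed on TRITON_BLOCK_SIZE, as get_autotune_configs shows above. A sketch of that expansion using the TEST tuples from this diff:

    import triton

    # The TEST parameter set, expanded the same way get_autotune_configs does.
    autotune_parameters = [(16, 3, 8), (32, 3, 8), (64, 3, 8)]
    autotune_configs = [
        triton.Config({"TRITON_BLOCK_SIZE": block_size}, num_stages=num_stages, num_warps=num_warps)
        for block_size, num_stages, num_warps in autotune_parameters
    ]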
@@ -26,7 +26,6 @@ def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block

      # Apply weights
      sparsity_layout_xw = build_sparsity_layout_matmul_fast(sparsity_layout, sparsity_layout_w_t)
-     # TODO At the moment, manual cast is needed. Bug with custom_fwd?
      xw = matmul(x, sparsity_layout, BlksprsTensor(w_t_bs.to(x.dtype)), sparsity_layout_w_t, sparsity_layout_xw, sparsity_block_size)
      interim = xw

blksprs/utils/tools.py CHANGED
@@ -1,3 +1,6 @@
+ import tomllib
+ from pathlib import Path
+
  import torch
  from torch import Tensor, Size

@@ -5,6 +8,11 @@ from torch import Tensor, Size
  torch._dynamo.config.capture_scalar_outputs = True


+ def version():
+     with open(Path(__file__).parent.parent.parent.joinpath("pyproject.toml"), "rb") as f:
+         return tomllib.load(f)["project"]["version"]
+
+
  def do_shape_blocksparse(x: Tensor):
      if x.dim() == 3:
          return x.contiguous(), x.size()
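Note that the new version() helper resolves pyproject.toml relative to the source tree, which exists in a checkout but not inside an installed wheel; for an installed package, importlib.metadata is the usual fallback (a sketch, not part of the package):

    from importlib.metadata import version as pkg_version

    print(pkg_version("blksprs"))  # e.g. "2.0rc8" for the wheel this diff describes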