PyPI - blksprs - Versions diffs - 2.0rc6__py3-none-any.whl → 2.0rc8__py3-none-any.whl - Mend

blksprs 2.0rc6-py3-none-any.whl → 2.0rc8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

blksprs/__init__.py +1 -0
blksprs/layouting/distribution_layout.py +39 -26
blksprs/layouting/sparsity_layout.py +58 -45
blksprs/ops/conversion.py +86 -84
blksprs/ops/distribution.py +81 -79
blksprs/ops/flow.py +64 -60
blksprs/ops/matmul.py +50 -55
blksprs/ops/misc/broadcast_ops.py +29 -27
blksprs/ops/misc/row_wise.py +134 -132
blksprs/ops/partitioning.py +12 -10
blksprs/ops/repeat.py +6 -5
blksprs/ops/softmax.py +55 -47
blksprs/ops/transpose.py +8 -7
blksprs/utils/autotuning.py +10 -10
blksprs/utils/processing.py +0 -1
blksprs/utils/tools.py +8 -9
{blksprs-2.0rc6.dist-info → blksprs-2.0rc8.dist-info}/METADATA +7 -3
blksprs-2.0rc8.dist-info/RECORD +23 -0
{blksprs-2.0rc6.dist-info → blksprs-2.0rc8.dist-info}/WHEEL +1 -1
blksprs-2.0rc6.dist-info/RECORD +0 -23
{blksprs-2.0rc6.dist-info → blksprs-2.0rc8.dist-info}/top_level.txt +0 -0

blksprs/ops/distribution.py CHANGED Viewed

@@ -51,44 +51,45 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
                                         sparsity_block_size))
-@triton_op("blksprs::gather", mutates_args={})
+@triton_op("blksprs::gather_forward", mutates_args={})
 def gather_forward(x: Tensor, sparsity_layout_x: Tensor, sparsity_reverse_lut_x: Tensor,
                    dim: int, i: Tensor, _: Tensor, sparsity_lut_i: Tensor,
                    sparsity_block_size: int) -> Tensor:
-    output = torch.zeros_like(i, dtype=x.dtype)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout_x.size()
-    s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = stride(sparsity_layout_x)
-    i_b, i_r, i_c = i.size()
-    i_b_s, i_r_s, i_c_s = stride(i)
-    s_lut_i_r, s_lut_i_c = sparsity_lut_i.size()
-    s_lut_i_r_s, s_lut_i_c_s = stride(sparsity_lut_i)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(gather_kernel)[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
-      sparsity_reverse_lut_x,
-      dim,
-      i,
-      i_b, i_b_s, i_r_s, i_c_s,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      sparsity_lut_i, s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
-      sparsity_block_size))
-    return output
-def gather_backward(ctx, grad_output):
+    with torch.no_grad():
+        output = torch.zeros_like(i, dtype=x.dtype)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout_x.size()
+        s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = stride(sparsity_layout_x)
+        i_b, i_r, i_c = i.size()
+        i_b_s, i_r_s, i_c_s = stride(i)
+        s_lut_i_r, s_lut_i_c = sparsity_lut_i.size()
+        s_lut_i_r_s, s_lut_i_c_s = stride(sparsity_lut_i)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        triton_grid = lambda meta: [o_b,
+                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(gather_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
+          sparsity_reverse_lut_x,
+          dim,
+          i,
+          i_b, i_b_s, i_r_s, i_c_s,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          sparsity_lut_i, s_lut_i_r, s_lut_i_r_s, s_lut_i_c_s,
+          sparsity_block_size))
+        return output
+def gather_wrapper_backward(ctx, grad_output):
     sparsity_layout_x, i, sparsity_layout_i = ctx.saved_tensors
     dim = ctx.dim
     sparsity_block_size = ctx.sparsity_block_size
@@ -221,7 +222,7 @@ def gather_setup_context(ctx, inputs, output):
     ctx.sparsity_block_size = sparsity_block_size
-gather_forward.register_autograd(gather_backward, setup_context=gather_setup_context)
+gather_forward.register_autograd(gather_wrapper_backward, setup_context=gather_setup_context)
 def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
@@ -240,7 +241,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
                           reduce_op="none", lut=lut)
-@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
                    dim: int,
                    idx: BlksprsTensor,
@@ -288,52 +289,53 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
                                                 reduce_op))
-@triton_op("blksprs::scatter_reduce", mutates_args={})
+@triton_op("blksprs::scatter_reduce_forward", mutates_args={})
 def scatter_reduce_forward(x: Tensor, _: Tensor, sparsity_lut_x: Tensor,
                            dim: int, i: Tensor,
                            sparsity_layout_o: Tensor, sparsity_reverse_lut_o: Tensor,
                            sparsity_block_size: int, n_sparse_blocks: int,
                            reduce_op: str) -> Tensor:
-    output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
-                         dtype=x.dtype, device=x.device)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    s_lut_x_r, s_lut_x_c = sparsity_lut_x.size()
-    s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut_x)
-    i_b, i_r, i_c = i.size()
-    i_b_s, i_r_s, i_c_s = stride(i)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_o.size()
-    s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_o)
-    triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-    reduce_op_ind = 0
-    if reduce_op == "sum":
-        reduce_op_ind = 1
-    (wrap_triton(scatter_reduce_kernel)[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      sparsity_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-      dim,
-      i,
-      i_b, i_b_s, i_r_s, i_c_s,
-      output,
-      o_b, o_b_s,
-      s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
-      sparsity_reverse_lut_o,
-      reduce_op_ind,
-      sparsity_block_size))
-    return output
-def scatter_reduce_backward(ctx, grad_output):
+    with torch.no_grad():
+        output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
+                             dtype=x.dtype, device=x.device)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        s_lut_x_r, s_lut_x_c = sparsity_lut_x.size()
+        s_lut_x_r_s, s_lut_x_c_s = stride(sparsity_lut_x)
+        i_b, i_r, i_c = i.size()
+        i_b_s, i_r_s, i_c_s = stride(i)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_o.size()
+        s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_o)
+        triton_grid = lambda meta: [x_b,
+                                    triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+        reduce_op_ind = 0
+        if reduce_op == "sum":
+            reduce_op_ind = 1
+        (wrap_triton(scatter_reduce_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          sparsity_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+          dim,
+          i,
+          i_b, i_b_s, i_r_s, i_c_s,
+          output,
+          o_b, o_b_s,
+          s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
+          sparsity_reverse_lut_o,
+          reduce_op_ind,
+          sparsity_block_size))
+        return output
+def scatter_reduce_wrapper_backward(ctx, grad_output):
     sparsity_layout_x, i, sparsity_layout_o = ctx.saved_tensors
     dim = ctx.dim
     sparsity_block_size = ctx.sparsity_block_size
@@ -477,4 +479,4 @@ def scatter_reduce_setup_context(ctx, inputs, output):
     ctx.reduce_op = reduce_op
-scatter_reduce_forward.register_autograd(scatter_reduce_backward, setup_context=scatter_reduce_setup_context)
+scatter_reduce_forward.register_autograd(scatter_reduce_wrapper_backward, setup_context=scatter_reduce_setup_context)

blksprs/ops/flow.py CHANGED Viewed

@@ -9,39 +9,41 @@ from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
-@triton_op("blksprs::flow_pull", mutates_args={})
+@triton_op("blksprs::flow_pull_forward", mutates_args={})
 def flow_pull_forward(x: Tensor, sparsity_layout_o: Tensor,
                       sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
                       sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-    output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
-                         dtype=x.dtype, device=x.device)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_o.size()
-    s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_o)
-    s_lut_r, s_lut_c = sparsity_lut.size()
-    s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
-    triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(flow_pull_kernel)[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
-      sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-      sparsity_reverse_lut,
-      sparsity_block_size))
-    return output
+    with torch.no_grad():
+        output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
+                             dtype=x.dtype, device=x.device)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        s_l_o_b, s_l_o_r, s_l_o_c = sparsity_layout_o.size()
+        s_l_o_b_s, s_l_o_r_s, s_l_o_c_s = stride(sparsity_layout_o)
+        s_lut_r, s_lut_c = sparsity_lut.size()
+        s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
+        triton_grid = lambda meta: [o_b,
+                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(flow_pull_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          s_l_o_b, s_l_o_b_s, s_l_o_r_s, s_l_o_c_s,
+          sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+          sparsity_reverse_lut,
+          sparsity_block_size))
+        return output
+# noinspection PyUnusedLocal
 @triton.autotune(
     configs=get_autotune_configs(),
     key=["sparsity_block_size"],
@@ -99,38 +101,40 @@ def flow_pull_kernel(x,
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
-@triton_op("blksprs::flow_push", mutates_args={})
+@triton_op("blksprs::flow_push_forward", mutates_args={})
 def flow_push_forward(x: Tensor, sparsity_layout_x: Tensor, sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
                       sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-    output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
-                         dtype=x.dtype, device=x.device)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout_x.size()
-    s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = stride(sparsity_layout_x)
-    s_lut_r, s_lut_c = sparsity_lut.size()
-    s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    triton_grid = lambda meta: [x_b,
-                                triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(flow_push_kernel)[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
-      sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
-      sparsity_reverse_lut,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      sparsity_block_size))
-    return output
+    with torch.no_grad():
+        output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
+                             dtype=x.dtype, device=x.device)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout_x.size()
+        s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = stride(sparsity_layout_x)
+        s_lut_r, s_lut_c = sparsity_lut.size()
+        s_lut_r_s, s_lut_c_s = stride(sparsity_lut)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        triton_grid = lambda meta: [x_b,
+                                    triton.cdiv(x_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(x_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(flow_push_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          s_l_x_b, s_l_x_b_s, s_l_x_r_s, s_l_x_c_s,
+          sparsity_lut, s_lut_r, s_lut_r_s, s_lut_c_s,
+          sparsity_reverse_lut,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          sparsity_block_size))
+        return output
+# noinspection PyUnusedLocal
 @triton.autotune(
     configs=get_autotune_configs(),
     key=["sparsity_block_size"],

blksprs/ops/matmul.py CHANGED Viewed

@@ -55,53 +55,54 @@ def matmul(x: BlksprsTensor, sparsity_layout_x: Tensor,
                                         sparsity_block_size, lut["n_sparse_blocks"]))
-@triton_op("blksprs::matmul", mutates_args={})
+@triton_op("blksprs::matmul_forward", mutates_args={})
 def matmul_forward(x: Tensor, y: Tensor,
                    sparsity_layout_x: Tensor, sparsity_reverse_lut_x: Tensor,
                    sparsity_layout_y: Tensor, sparsity_reverse_lut_y: Tensor,
                    _: Tensor, sparsity_lut_o: Tensor,
                    sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-    output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
-                         dtype=x.dtype, device=x.device)
-    x_b, x_r, x_c = x.size()
-    x_b_s, x_r_s, x_c_s = stride(x)
-    s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout_x.size()
-    s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = stride(sparsity_layout_x)
-    y_b, y_r, y_c = y.size()
-    y_b_s, y_r_s, y_c_s = stride(y)
-    s_l_y_b, s_l_y_r, s_l_y_c = sparsity_layout_y.size()
-    s_l_y_b_s, s_l_y_r_s, s_l_y_c_s = stride(sparsity_layout_y)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_lut_o_r, s_lut_o_c = sparsity_lut_o.size()
-    s_lut_o_r_s, s_lut_o_c_s = stride(sparsity_lut_o)
-    triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(matmul_kernel)[triton_grid]
-     (x,
-      x_b, x_b_s, x_r_s, x_c_s,
-      s_l_x_b, s_l_x_b_s, s_l_x_r_s,
-      s_l_x_c, s_l_x_c_s,
-      sparsity_reverse_lut_x,
-      y,
-      y_b, y_b_s, y_r_s, y_c_s,
-      s_l_y_b, s_l_y_b_s, s_l_y_r_s,
-      s_l_y_c_s,
-      sparsity_reverse_lut_y,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      sparsity_lut_o,
-      s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
-      sparsity_block_size))
-    return output
-def matmul_backward(ctx, grad_output):
+    with torch.no_grad():
+        output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
+                             dtype=x.dtype, device=x.device)
+        x_b, x_r, x_c = x.size()
+        x_b_s, x_r_s, x_c_s = stride(x)
+        s_l_x_b, s_l_x_r, s_l_x_c = sparsity_layout_x.size()
+        s_l_x_b_s, s_l_x_r_s, s_l_x_c_s = stride(sparsity_layout_x)
+        y_b, y_r, y_c = y.size()
+        y_b_s, y_r_s, y_c_s = stride(y)
+        s_l_y_b, s_l_y_r, s_l_y_c = sparsity_layout_y.size()
+        s_l_y_b_s, s_l_y_r_s, s_l_y_c_s = stride(sparsity_layout_y)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        s_lut_o_r, s_lut_o_c = sparsity_lut_o.size()
+        s_lut_o_r_s, s_lut_o_c_s = stride(sparsity_lut_o)
+        triton_grid = lambda meta: [o_b,
+                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(matmul_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_r_s, x_c_s,
+          s_l_x_b, s_l_x_b_s, s_l_x_r_s,
+          s_l_x_c, s_l_x_c_s,
+          sparsity_reverse_lut_x,
+          y,
+          y_b, y_b_s, y_r_s, y_c_s,
+          s_l_y_b, s_l_y_b_s, s_l_y_r_s,
+          s_l_y_c_s,
+          sparsity_reverse_lut_y,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          sparsity_lut_o,
+          s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+          sparsity_block_size))
+        return output
+def matmul_wrapper_backward(ctx, grad_output):
     x, sparsity_layout_x, y, sparsity_layout_y, sparsity_layout_o = ctx.saved_tensors
     sparsity_block_size = ctx.sparsity_block_size
@@ -187,20 +188,16 @@ def matmul_kernel(x,
                          ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                          ((i_seg_tri_mod * TRITON_BLOCK_SIZE +
                            tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-            blk_x_msk = ((blk_x_idx >= 0 and
-                          blk_x_idx < x_b * x_b_s) and
-                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < TRITON_BLOCK_SIZE and
-                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < TRITON_BLOCK_SIZE))
+            blk_x_msk = (blk_x_idx >= 0 and
+                         blk_x_idx < x_b * x_b_s)
             blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
             blk_y_idx = ((rev_idx_spa_y * y_b_s) +
                          ((i_seg_tri_mod * TRITON_BLOCK_SIZE +
                            tl.arange(0, TRITON_BLOCK_SIZE)) * y_r_s)[:, None] +
                          ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * y_c_s)[None, :])
-            blk_y_msk = ((blk_y_idx >= 0 and
-                          blk_y_idx < y_b * y_b_s) and
-                         (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < TRITON_BLOCK_SIZE and
-                          tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < TRITON_BLOCK_SIZE))
+            blk_y_msk = (blk_y_idx >= 0 and
+                         blk_y_idx < y_b * y_b_s)
             blk_y = tl.load(y + blk_y_idx, mask=blk_y_msk)
             # Perform matrix multiplication
@@ -213,10 +210,8 @@ def matmul_kernel(x,
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = ((blk_o_idx >= 0 and
-                  blk_o_idx < o_b * o_b_s) and
-                 (tl.arange(0, TRITON_BLOCK_SIZE)[:, None] < TRITON_BLOCK_SIZE and
-                  tl.arange(0, TRITON_BLOCK_SIZE)[None, :] < TRITON_BLOCK_SIZE))
+    blk_o_msk = (blk_o_idx >= 0 and
+                 blk_o_idx < o_b * o_b_s)
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
@@ -262,4 +257,4 @@ def matmul_setup_context(ctx, inputs, output):
     ctx.sparsity_block_size = sparsity_block_size
-matmul_forward.register_autograd(matmul_backward, setup_context=matmul_setup_context)
+matmul_forward.register_autograd(matmul_wrapper_backward, setup_context=matmul_setup_context)

blksprs/ops/misc/broadcast_ops.py CHANGED Viewed

@@ -12,6 +12,7 @@ from blksprs.utils.validation import validate_contiguous, validate_device, \
     validate_sparsity_block_size
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
                   sparsity_block_size: int) -> BlksprsTensor:
     """Performs a broadcast and subsequent addition of two dense tensors x and y. Returns a block-sparse tensor in
@@ -54,36 +55,37 @@ def broadcast_sub(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
     return broadcast_add(x, torch.neg(y), sparsity_layout_output, sparsity_block_size)
-@triton_op("blksprs::broadcast_add", mutates_args={})
+@triton_op("blksprs::broadcast_add_forward", mutates_args={})
 def broadcast_add_forward(x: Tensor, y: Tensor,
                           sparsity_lut_o: Tensor,
                           sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-    output = torch.zeros(n_sparse_blocks, sparsity_block_size, sparsity_block_size, dtype=x.dtype, device=x.device)
-    x_b, x_c = x.size()
-    x_b_s, x_c_s = stride(x)
-    y_b, y_c = y.size()
-    y_b_s, y_c_s = stride(y)
-    o_b, o_r, o_c = output.size()
-    o_b_s, o_r_s, o_c_s = stride(output)
-    s_lut_o_r, s_lut_o_c = sparsity_lut_o.size()
-    s_lut_o_r_s, s_lut_o_c_s = stride(sparsity_lut_o)
-    triton_grid = lambda meta: [o_b,
-                                triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
-                                triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
-    (wrap_triton(broadcast_add_kernel)[triton_grid]
-     (x,
-      x_b, x_b_s, x_c_s,
-      y,
-      y_b, y_b_s, y_c_s,
-      output,
-      o_b, o_b_s, o_r_s, o_c_s,
-      sparsity_lut_o, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
-      sparsity_block_size))
-    return BlksprsTensor(output)
+    with torch.no_grad():
+        output = torch.zeros(n_sparse_blocks, sparsity_block_size, sparsity_block_size, dtype=x.dtype, device=x.device)
+        x_b, x_c = x.size()
+        x_b_s, x_c_s = stride(x)
+        y_b, y_c = y.size()
+        y_b_s, y_c_s = stride(y)
+        o_b, o_r, o_c = output.size()
+        o_b_s, o_r_s, o_c_s = stride(output)
+        s_lut_o_r, s_lut_o_c = sparsity_lut_o.size()
+        s_lut_o_r_s, s_lut_o_c_s = stride(sparsity_lut_o)
+        triton_grid = lambda meta: [o_b,
+                                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
+                                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
+        (wrap_triton(broadcast_add_kernel)[triton_grid]
+         (x,
+          x_b, x_b_s, x_c_s,
+          y,
+          y_b, y_b_s, y_c_s,
+          output,
+          o_b, o_b_s, o_r_s, o_c_s,
+          sparsity_lut_o, s_lut_o_r, s_lut_o_r_s, s_lut_o_c_s,
+          sparsity_block_size))
+        return output
 @triton.autotune(

blksprs 2.0rc6__py3-none-any.whl → 2.0rc8__py3-none-any.whl

blksprs 2.0rc6-py3-none-any.whl → 2.0rc8-py3-none-any.whl