blksprs-2.0rc2.tar.gz → blksprs-2.0rc4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {blksprs-2.0rc2 → blksprs-2.0rc4}/PKG-INFO +2 -2
- {blksprs-2.0rc2 → blksprs-2.0rc4}/README.md +1 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/layouting/sparsity_layout.py +3 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/conversion.py +6 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/distribution.py +4 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/flow.py +2 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/matmul.py +3 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/misc/row_wise.py +6 -2
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/partitioning.py +2 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/repeat.py +2 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/softmax.py +9 -5
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/transpose.py +1 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/utils/processing.py +3 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/utils/validation.py +18 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs.egg-info/PKG-INFO +2 -2
- {blksprs-2.0rc2 → blksprs-2.0rc4}/pyproject.toml +1 -1
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/__init__.py +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/layouting/distribution_layout.py +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/misc/broadcast_ops.py +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/utils/benchmarking.py +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/utils/blksprs_tensor.py +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/utils/tools.py +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs.egg-info/SOURCES.txt +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs.egg-info/dependency_links.txt +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs.egg-info/requires.txt +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs.egg-info/top_level.txt +0 -0
- {blksprs-2.0rc2 → blksprs-2.0rc4}/setup.cfg +0 -0

{blksprs-2.0rc2 → blksprs-2.0rc4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.0rc2
+Version: 2.0rc4
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
@@ -27,7 +27,7 @@ Requires-Dist: matplotlib; extra == "test"
 ### News
 
 🎉 ***Version 2.0 released***. blksprs now supports kernel auto-tuning, JIT compilation, specification of pre-calculated
-LUTs, and makes use of `torch.library.triton_op()`!
+LUTs, autocasting, and makes use of `torch.library.triton_op()`!
 
 ---
 

{blksprs-2.0rc2 → blksprs-2.0rc4}/README.md

@@ -8,7 +8,7 @@
 ### News
 
 🎉 ***Version 2.0 released***. blksprs now supports kernel auto-tuning, JIT compilation, specification of pre-calculated
-LUTs, and makes use of `torch.library.triton_op()`!
+LUTs, autocasting, and makes use of `torch.library.triton_op()`!
 
 ---
 
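
The News entry advertises autocasting support, which this release implements by decorating the public entry points with `torch.amp.custom_fwd` (see the per-file hunks below). A rough usage sketch, assuming the submodule import paths visible in this diff and the signatures shown in the hunks; the package may also re-export these names at the top level, so the project README remains the canonical reference:

```python
import torch

# Requires a CUDA device and Triton. Import paths follow the files in this diff
# (the package may re-export these names elsewhere); signatures follow the hunks.
from blksprs.layouting.sparsity_layout import build_sparsity_layout, build_sparsity_layout_matmul_fast
from blksprs.ops.conversion import to_sparse, to_dense
from blksprs.ops.matmul import matmul

sparsity_block_size = 64
x = torch.randn(2, 256, 256, device="cuda")
y = torch.randn(2, 256, 256, device="cuda")

# Layouts marking which blocks are non-zero.
sparsity_layout_x = build_sparsity_layout(x, sparsity_block_size)
sparsity_layout_y = build_sparsity_layout(y, sparsity_block_size)
sparsity_layout_o = build_sparsity_layout_matmul_fast(sparsity_layout_x, sparsity_layout_y)

# Inside autocast, ops decorated with custom_fwd(cast_inputs=torch.float16)
# receive float16 inputs automatically, so float32 tensors can be passed as-is.
with torch.autocast(device_type="cuda", dtype=torch.float16):
    x_bs = to_sparse(x, sparsity_layout_x, sparsity_block_size)
    y_bs = to_sparse(y, sparsity_layout_y, sparsity_block_size)
    o_bs = matmul(x_bs, sparsity_layout_x, y_bs, sparsity_layout_y,
                  sparsity_layout_o, sparsity_block_size)
    o = to_dense(o_bs, sparsity_layout_o, sparsity_block_size)
```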

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/layouting/sparsity_layout.py

@@ -12,6 +12,7 @@ from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous, validate_sparsity, validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def build_sparsity_layout(x: Tensor, sparsity_block_size: int) -> Tensor:
     """Builds the sparsity layout of a dense tensor in regular form covering its sparse blocks.
 
@@ -199,6 +200,7 @@ def build_sparsity_layout_adaption_kernel(x,
     tl.store(o + blk_o_idx, 1, mask=blk_o_msk)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def build_sparsity_layout_matmul(sparsity_layout_x: Tensor, sparsity_layout_y: Tensor) -> Tensor:
     """Builds the precise sparsity layout of the result of a matrix multiplication between the two input tensors.
 
@@ -213,6 +215,7 @@ def build_sparsity_layout_matmul(sparsity_layout_x: Tensor, sparsity_layout_y: T
     return torch.matmul(sparsity_layout_x.to(torch.float), sparsity_layout_y.to(torch.float)).to(torch.bool)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def build_sparsity_layout_matmul_fast(sparsity_layout_x: Tensor, sparsity_layout_y: Tensor):
     """Builds the approximate sparsity layout of the result of a matrix multiplication between the two input tensors.
 
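
All three layout builders gain `@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)`. The decorator is a PyTorch AMP helper: inside a `torch.autocast` region it casts floating-point CUDA tensor arguments to the requested dtype and disables autocast for the body of the call; outside such a region it passes arguments through unchanged. A minimal sketch of that behavior on a plain function, mirroring how the hunks above apply it (not blksprs code):

```python
import torch

# Requires a CUDA device.
@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
def entry_dtype(x: torch.Tensor) -> torch.dtype:
    # Returns the dtype the decorated function actually sees.
    return x.dtype

x = torch.randn(4, 4, device="cuda")      # float32 input

print(entry_dtype(x))                     # torch.float32: no autocast, pass-through

with torch.autocast(device_type="cuda", dtype=torch.float16):
    print(entry_dtype(x))                 # torch.float16: cast on entry, autocast disabled inside
```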

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/conversion.py

@@ -18,6 +18,7 @@ def to_blksprs(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int) ->
     return to_sparse(x, sparsity_layout, sparsity_block_size)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def to_sparse(x: Tensor, sparsity_layout: Tensor,
               sparsity_block_size: int, lut: dict = None) -> BlksprsTensor:
     """Converts a block-sparse tensor in regular form to a block-sparse tensor in compressed form based on the given
@@ -53,7 +54,7 @@ def to_sparse(x: Tensor, sparsity_layout: Tensor,
 @triton_op("blksprs::to_sparse", mutates_args={})
 def to_sparse_forward(x: Tensor, _: Tensor,
                       sparsity_lut: Tensor, sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-    output = torch.
+    output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
                          dtype=x.dtype, device=x.device)
 
     x_b, x_r, x_c = x.size()
@@ -86,6 +87,7 @@ def to_sparse_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
     key=[],
+    reset_to_zero=["o"]
 )
 @triton.jit
 def to_sparse_kernel(x,
@@ -175,6 +177,7 @@ def from_blksprs(x: BlksprsTensor, sparsity_layout: Tensor,
     return to_dense(x, sparsity_layout, sparsity_block_size, fill_value=fill_value, lut=lut)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def to_dense(x: BlksprsTensor, sparsity_layout: Tensor,
              sparsity_block_size: int, fill_value: float = 0, lut: dict = None) -> Tensor:
     """Converts a block-sparse tensor in compressed form to a block-sparse tensor in regular form based on the given
@@ -250,6 +253,7 @@ def to_dense_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
     key=[],
+    restore_value=["o"]
 )
 @triton.jit
 def to_dense_kernel(x,
@@ -326,6 +330,7 @@ def to_dense_setup_context(ctx, inputs, output):
 to_dense_forward.register_autograd(to_dense_backward, setup_context=to_dense_setup_context)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_size_from: int,
                  sparsity_block_size_to: int, sparsity_layout_to: Tensor = None) -> (BlksprsTensor, Tensor):
     """Adapts the sparsity layout of a block-sparse tensor, resulting in a new block-sparse tensor in compressed form
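
Besides the decorators, `conversion.py` adds `reset_to_zero=["o"]` and `restore_value=["o"]` to its `@triton.autotune` decorators, alongside changing the output allocation to `torch.zeros` (the removed allocation line is truncated in this diff view). The Triton autotuner benchmarks every candidate config against the real arguments, so a kernel that accumulates into, or only partially writes, its output buffer would otherwise see stale data from earlier trials; `reset_to_zero` zeroes the named argument before each trial, while `restore_value` snapshots and restores it. A minimal standalone sketch (not a blksprs kernel):

```python
import torch
import triton
import triton.language as tl

# Minimal sketch of triton.autotune's reset_to_zero on an accumulating kernel.
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 128}),
        triton.Config({"BLOCK_SIZE": 256}),
    ],
    key=["n_elements"],
    reset_to_zero=["out_ptr"],  # zero this argument before each benchmarked run
)
@triton.jit
def sum_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
    # Accumulates into the output; without reset_to_zero, every autotune trial
    # would add on top of the previous trial's result.
    tl.atomic_add(out_ptr, tl.sum(x, axis=0))


x = torch.ones(4096, device="cuda")
out = torch.zeros(1, device="cuda")
grid = lambda meta: (triton.cdiv(x.numel(), meta["BLOCK_SIZE"]),)
sum_kernel[grid](x, out, x.numel())
print(out)  # tensor([4096.], device='cuda:0') regardless of how many configs were benchmarked
```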

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/distribution.py

@@ -11,6 +11,7 @@ from blksprs.utils.validation import validate_contiguous, validate_dimensions, v
     validate_sparsity, validate_dtype_int, validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
            dim: int,
            idx: BlksprsTensor, sparsity_layout_idx: Tensor,
@@ -53,7 +54,7 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
 def gather_forward(x: Tensor, sparsity_layout_x: Tensor, sparsity_reverse_lut_x: Tensor,
                    dim: int, i: Tensor, _: Tensor, sparsity_lut_i: Tensor,
                    sparsity_block_size: int) -> Tensor:
-    output = torch.
+    output = torch.zeros_like(i, dtype=x.dtype)
 
     x_b, x_r, x_c = x.size()
     x_b_s, x_r_s, x_c_s = stride(x)
@@ -100,6 +101,7 @@ def gather_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
     key=[],
+    reset_to_zero=["o"]
 )
 @triton.jit
 def gather_kernel(x,
@@ -247,6 +249,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
                           reduce_op="none", lut=lut)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
                    dim: int,
                    idx: BlksprsTensor,

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/flow.py

@@ -12,7 +12,7 @@ from blksprs.utils.tools import stride, get_autotune_configs
 def flow_pull_forward(x: Tensor, sparsity_layout_o: Tensor,
                       sparsity_lut: Tensor, sparsity_reverse_lut: Tensor,
                       sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-    output = torch.
+    output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
                          dtype=x.dtype, device=x.device)
 
     x_b, x_r, x_c = x.size()
@@ -44,6 +44,7 @@ def flow_pull_forward(x: Tensor, sparsity_layout_o: Tensor,
 @triton.autotune(
     configs=get_autotune_configs(),
     key=[],
+    reset_to_zero=["o"]
 )
 @triton.jit
 def flow_pull_kernel(x,

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/matmul.py

@@ -11,6 +11,7 @@ from blksprs.utils.validation import validate_contiguous, validate_dimensions, v
     validate_sparsity, validate_sparsity_block_size, validate_dtype_float
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def matmul(x: BlksprsTensor, sparsity_layout_x: Tensor,
            y: BlksprsTensor, sparsity_layout_y: Tensor,
            sparsity_layout_output: Tensor,
@@ -59,7 +60,7 @@ def matmul_forward(x: Tensor, y: Tensor,
                    sparsity_layout_y: Tensor, sparsity_reverse_lut_y: Tensor,
                    _: Tensor, sparsity_lut_o: Tensor,
                    sparsity_block_size: int, n_sparse_blocks: int) -> Tensor:
-    output = torch.
+    output = torch.zeros(size=(n_sparse_blocks, sparsity_block_size, sparsity_block_size),
                          dtype=x.dtype, device=x.device)
 
     x_b, x_r, x_c = x.size()
@@ -117,6 +118,7 @@ def matmul_backward(ctx, grad_output):
 @triton.autotune(
     configs=get_autotune_configs(),
     key=[],
+    reset_to_zero=["o"]
 )
 @triton.jit
 def matmul_kernel(x,

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/misc/row_wise.py

@@ -10,6 +10,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
                  flag_slice_only: bool = False) -> (BlksprsTensor, Tensor):
     """Computes the row-wise sum of a block-sparse tensor.
@@ -156,6 +157,7 @@ def row_wise_sum_kernel(x,
     tl.atomic_add(o + o_idx, buf, o_msk)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
                  flag_slice_only: bool = False) -> (BlksprsTensor, Tensor):
     """Computes the row-wise max of a block-sparse tensor.
@@ -304,6 +306,7 @@ def row_wise_max_kernel(x,
     tl.atomic_max(o + o_idx, buf, o_msk)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
                  sparsity_block_size: int) -> BlksprsTensor:
     """For each row in ``y`` adds the value to each value in the corresponding row of the block-sparse tensor ``x``.
@@ -351,7 +354,7 @@ def row_wise_sub(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
 def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
                          sparsity_layout_x_rwm: Tensor, sparsity_reverse_x_lut_rwm: Tensor,
                          y: Tensor, sparsity_block_size: int) -> Tensor:
-    output = torch.
+    output = torch.zeros_like(x)
 
     x_b, x_r, x_c = x.size()
     x_b_s, x_r_s, x_c_s = stride(x)
@@ -384,7 +387,8 @@ def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[]
+    key=[],
+    reset_to_zero=["o"]
 )
 @triton.jit
 def kernel_blocksparse_row_wise_add(x,
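
Note that `row_wise_sum` is pinned to `cast_inputs=torch.float32` while `row_wise_max` and `row_wise_add` stay at float16, presumably because the sum kernel accumulates with `tl.atomic_add`, where float16 runs out of precision quickly (a max reduction does not accumulate rounding error the same way). A small illustration of the underlying effect, independent of blksprs:

```python
import torch

# Above 2048, float16 cannot represent x + 1 exactly, so a running float16 sum
# of ones stalls while the float32 sum does not.
acc16 = torch.tensor(0.0, dtype=torch.float16)
acc32 = torch.tensor(0.0, dtype=torch.float32)
for _ in range(5000):
    acc16 += torch.tensor(1.0, dtype=torch.float16)
    acc32 += torch.tensor(1.0, dtype=torch.float32)

print(acc16)  # tensor(2048., dtype=torch.float16) -- stuck once the spacing exceeds 1
print(acc32)  # tensor(5000.)
```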

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/partitioning.py

@@ -8,6 +8,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity, validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
           dim: int, sparsity_block_size: int, lut: dict = None) -> (
         BlksprsTensor, Tensor):
@@ -111,6 +112,7 @@ def split_setup_context(ctx, inputs, output):
 split_forward.register_autograd(split_backward, setup_context=split_setup_context)
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
           dim: int, sparsity_block_size: int, lut: dict = None) -> (
         BlksprsTensor, Tensor):

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/repeat.py

@@ -8,6 +8,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity, validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int, int],
            sparsity_block_size: int, sparsity_layout_output: Tensor = None, lut: dict = None) -> (
         BlksprsTensor, Tensor):
@@ -50,6 +51,7 @@ def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int,
         lut["sparsity_reverse_lut"], sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_o"]
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,
                       sparsity_block_size: int, sparsity_layout_output: Tensor = None, lut: dict = None) -> (
         BlksprsTensor, Tensor):

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/softmax.py

@@ -9,9 +9,10 @@ from blksprs.ops.misc.row_wise import row_wise_sum, row_wise_max, row_wise_sub
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride, get_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int, lut: dict = None) -> BlksprsTensor:
     """Computes the softmax of a block-sparse tensor in compressed form.
 
@@ -32,6 +33,7 @@ def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
 
     validate_dimensions(x)
     validate_contiguous(x)
+    validate_dtype_float_32(x)
     validate_device(x)
     validate_sparsity(sparsity_block_size, (x, sparsity_layout))
     validate_sparsity_block_size(sparsity_block_size, x)
@@ -49,7 +51,7 @@ def softmax_forward(x: Tensor, sparsity_layout: Tensor,
                     sparsity_lut: Tensor,
                     sparsity_reverse_lut_rws: Tensor,
                     sparsity_block_size: int) -> Tensor:
-    output = torch.
+    output = torch.zeros_like(x)
 
     x_b, x_r, x_c = x.size()
     x_b_s, x_r_s, x_c_s = stride(x)
@@ -106,7 +108,7 @@ def softmax_backward(ctx, grad_output):
     s_l_s_b, s_l_s_r, s_l_s_c = sparsity_layout_s.size()
     s_l_s_b_s, s_l_s_r_s, s_l_s_c_s = stride(sparsity_layout_s)
 
-    grad_x = torch.
+    grad_x = torch.zeros_like(o, dtype=torch.float)
 
     triton_grid = lambda meta: [o_b,
                                 triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
@@ -131,7 +133,8 @@ def softmax_backward(ctx, grad_output):
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[]
+    key=[],
+    reset_to_zero=["o"]
 )
 @triton.jit
 def softmax_kernel(x,
@@ -196,7 +199,8 @@ def softmax_kernel(x,
 
 @triton.autotune(
     configs=get_autotune_configs(),
-    key=[]
+    key=[],
+    reset_to_zero=["o"]
 )
 @triton.jit
 def softmax_kernel_grad(g,
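
Softmax is the one op that both upcasts via `cast_inputs=torch.float32` and adds an explicit `validate_dtype_float_32` check. The decorator only takes effect inside an autocast region; called outside one it passes arguments through unchanged, which is presumably why the validator is still needed. Sketch of the two cases (not blksprs code):

```python
import torch

# Requires a CUDA device. cast_inputs only applies while autocast is enabled.
@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
def entry_dtype(x: torch.Tensor) -> torch.dtype:
    return x.dtype

x_half = torch.randn(8, 8, device="cuda", dtype=torch.float16)

with torch.autocast(device_type="cuda", dtype=torch.float16):
    print(entry_dtype(x_half))  # torch.float32 -- upcast on entry

print(entry_dtype(x_half))      # torch.float16 -- unchanged; this is the case the new validator catches
```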

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/ops/transpose.py

@@ -8,6 +8,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity, validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def transpose(x: BlksprsTensor, sparsity_layout: Tensor,
               sparsity_block_size: int, lut: dict = None) -> (BlksprsTensor, Tensor):
     """Transposes a block-sparse tensor in compressed form.

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/utils/processing.py

@@ -11,6 +11,7 @@ from blksprs.ops.repeat import repeat
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
                        linear: nn.Linear, bias: nn.Parameter = None) -> (BlksprsTensor, Tensor):
     # Extract weight and bias
@@ -25,7 +26,8 @@ def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block
 
     # Apply weights
     sparsity_layout_xw = build_sparsity_layout_matmul_fast(sparsity_layout, sparsity_layout_w_t)
-
+    # TODO At the moment, manual cast is needed. Bug with custom_fwd?
+    xw = matmul(x, sparsity_layout, BlksprsTensor(w_t_bs.to(x.dtype)), sparsity_layout_w_t, sparsity_layout_xw, sparsity_block_size)
     interim = xw
 
     # Apply bias
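
One likely reason for the manual cast flagged by the TODO: `cast_inputs` only rewrites tensors that are passed directly as arguments to the decorated function, and `apply_torch_linear` receives the `nn.Linear` module rather than its weight, so the weight extracted inside keeps its original dtype and has to be cast to `x.dtype` by hand. Sketch of the mechanism (not blksprs code):

```python
import torch
from torch import nn

# Requires a CUDA device. Tensors reached through a module attribute are not
# touched by cast_inputs; only direct tensor arguments are cast.
@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
def arg_vs_attr(x: torch.Tensor, linear: nn.Linear) -> tuple[torch.dtype, torch.dtype]:
    return x.dtype, linear.weight.dtype

x = torch.randn(4, 4, device="cuda")
linear = nn.Linear(4, 4).cuda()

with torch.autocast(device_type="cuda", dtype=torch.float16):
    print(arg_vs_attr(x, linear))  # (torch.float16, torch.float32)
```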

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs/utils/validation.py

@@ -26,10 +26,27 @@ def validate_dtype_float(*tensors: Tensor) -> None:
     if _check_skip_validation():
         return
 
-    for tensor in tensors:
+    dtype = None
+
+    for i, tensor in enumerate(tensors):
+        if i == 0:
+            dtype = tensor.dtype
+
         if tensor.dtype != torch.float16 and tensor.dtype != torch.float32:
             raise ValueError("Tensor must have either float16 or float32 dtype")
 
+        if tensor.dtype != dtype:
+            raise ValueError("Tensors must have same dtype")
+
+
+def validate_dtype_float_32(*tensors: Tensor) -> None:
+    if _check_skip_validation():
+        return
+
+    for tensor in tensors:
+        if tensor.dtype != torch.float32:
+            raise ValueError("Tensor must have float32 dtype")
+
 
 def validate_dtype_int(*tensors: Tensor) -> None:
     if _check_skip_validation():
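
Expected behavior of the updated validators, assuming validation has not been disabled via the library's skip switch (`_check_skip_validation`); the import path follows the file list above:

```python
import torch
from blksprs.utils.validation import validate_dtype_float, validate_dtype_float_32

a = torch.randn(2, 64, 64, dtype=torch.float16)
b = torch.randn(2, 64, 64, dtype=torch.float32)

validate_dtype_float(a, a)         # ok: both float16
validate_dtype_float(b, b)         # ok: both float32
try:
    validate_dtype_float(a, b)     # mixed dtypes now rejected
except ValueError as e:
    print(e)                       # "Tensors must have same dtype"

validate_dtype_float_32(b)         # ok
try:
    validate_dtype_float_32(a)     # float16 rejected by the stricter check
except ValueError as e:
    print(e)                       # "Tensor must have float32 dtype"
```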

{blksprs-2.0rc2 → blksprs-2.0rc4}/blksprs.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.0rc2
+Version: 2.0rc4
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
@@ -27,7 +27,7 @@ Requires-Dist: matplotlib; extra == "test"
 ### News
 
 🎉 ***Version 2.0 released***. blksprs now supports kernel auto-tuning, JIT compilation, specification of pre-calculated
-LUTs, and makes use of `torch.library.triton_op()`!
+LUTs, autocasting, and makes use of `torch.library.triton_op()`!
 
 ---
 