blksprs-2.0rc1-py3-none-any.whl → blksprs-2.0rc3-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- blksprs/layouting/sparsity_layout.py +3 -0
- blksprs/ops/conversion.py +3 -0
- blksprs/ops/distribution.py +2 -0
- blksprs/ops/matmul.py +4 -0
- blksprs/ops/misc/row_wise.py +3 -0
- blksprs/ops/partitioning.py +2 -0
- blksprs/ops/repeat.py +2 -0
- blksprs/ops/softmax.py +3 -1
- blksprs/ops/transpose.py +1 -0
- blksprs/utils/processing.py +3 -1
- blksprs/utils/validation.py +19 -2
- {blksprs-2.0rc1.dist-info → blksprs-2.0rc3.dist-info}/METADATA +2 -2
- blksprs-2.0rc3.dist-info/RECORD +22 -0
- {blksprs-2.0rc1.dist-info → blksprs-2.0rc3.dist-info}/WHEEL +1 -1
- blksprs-2.0rc1.dist-info/RECORD +0 -22
- {blksprs-2.0rc1.dist-info → blksprs-2.0rc3.dist-info}/top_level.txt +0 -0
blksprs/layouting/sparsity_layout.py
CHANGED

@@ -12,6 +12,7 @@ from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous, validate_sparsity, validate_sparsity_block_size


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def build_sparsity_layout(x: Tensor, sparsity_block_size: int) -> Tensor:
     """Builds the sparsity layout of a dense tensor in regular form covering its sparse blocks.

@@ -199,6 +200,7 @@ def build_sparsity_layout_adaption_kernel(x,
     tl.store(o + blk_o_idx, 1, mask=blk_o_msk)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def build_sparsity_layout_matmul(sparsity_layout_x: Tensor, sparsity_layout_y: Tensor) -> Tensor:
     """Builds the precise sparsity layout of the result of a matrix multiplication between the two input tensors.

@@ -213,6 +215,7 @@ def build_sparsity_layout_matmul(sparsity_layout_x: Tensor, sparsity_layout_y: T
     return torch.matmul(sparsity_layout_x.to(torch.float), sparsity_layout_y.to(torch.float)).to(torch.bool)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def build_sparsity_layout_matmul_fast(sparsity_layout_x: Tensor, sparsity_layout_y: Tensor):
     """Builds the approximate sparsity layout of the result of a matrix multiplication between the two input tensors.
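The change repeated across the modules below is the addition of `torch.amp.custom_fwd`, which makes each op well defined under `torch.autocast` by first casting all floating-point tensor arguments to a fixed dtype. A minimal sketch of the mechanism, using a hypothetical `scaled_sum` op rather than an actual blksprs function (requires a CUDA device):

import torch

# Hypothetical op, used only to illustrate the decorator; blksprs 2.0rc3
# applies the same pattern to its public ops.
@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
def scaled_sum(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # By the time the body runs, mixed-dtype inputs have been unified
    # and autocast has been disabled for the region.
    return (x + y) * 0.5

x = torch.randn(4, device="cuda", dtype=torch.float32)
y = torch.randn(4, device="cuda", dtype=torch.float16)

with torch.autocast(device_type="cuda"):
    out = scaled_sum(x, y)  # both inputs are cast to float16 first

print(out.dtype)  # torch.float16; outside autocast, no cast happens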
blksprs/ops/conversion.py
CHANGED

@@ -18,6 +18,7 @@ def to_blksprs(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int) ->
     return to_sparse(x, sparsity_layout, sparsity_block_size)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def to_sparse(x: Tensor, sparsity_layout: Tensor,
               sparsity_block_size: int, lut: dict = None) -> BlksprsTensor:
     """Converts a block-sparse tensor in regular form to a block-sparse tensor in compressed form based on the given

@@ -175,6 +176,7 @@ def from_blksprs(x: BlksprsTensor, sparsity_layout: Tensor,
     return to_dense(x, sparsity_layout, sparsity_block_size, fill_value=fill_value, lut=lut)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def to_dense(x: BlksprsTensor, sparsity_layout: Tensor,
              sparsity_block_size: int, fill_value: float = 0, lut: dict = None) -> Tensor:
     """Converts a block-sparse tensor in compressed form to a block-sparse tensor in regular form based on the given

@@ -326,6 +328,7 @@ def to_dense_setup_context(ctx, inputs, output):
 to_dense_forward.register_autograd(to_dense_backward, setup_context=to_dense_setup_context)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_size_from: int,
                  sparsity_block_size_to: int, sparsity_layout_to: Tensor = None) -> (BlksprsTensor, Tensor):
     """Adapts the sparsity layout of a block-sparse tensor, resulting in a new block-sparse tensor in compressed form
blksprs/ops/distribution.py
CHANGED

@@ -11,6 +11,7 @@ from blksprs.utils.validation import validate_contiguous, validate_dimensions, v
     validate_sparsity, validate_dtype_int, validate_sparsity_block_size


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
            dim: int,
            idx: BlksprsTensor, sparsity_layout_idx: Tensor,

@@ -247,6 +248,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
                           reduce_op="none", lut=lut)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
                    dim: int,
                    idx: BlksprsTensor,
blksprs/ops/matmul.py
CHANGED

@@ -11,6 +11,7 @@ from blksprs.utils.validation import validate_contiguous, validate_dimensions, v
     validate_sparsity, validate_sparsity_block_size, validate_dtype_float


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def matmul(x: BlksprsTensor, sparsity_layout_x: Tensor,
            y: BlksprsTensor, sparsity_layout_y: Tensor,
            sparsity_layout_output: Tensor,

@@ -205,6 +206,9 @@ def matmul_kernel(x,
     # Perform matrix multiplication
     buf += tl.dot(blk_x, blk_y)

+    # Cast buffer
+    buf = buf.to(o.dtype.element_ty)
+
     # Store output
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * val_tbs + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
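The added cast in `matmul_kernel` reflects how Triton handles accumulation: `tl.dot` accumulates in float32 regardless of input dtype, so the buffer must be converted to the output pointer's element type before `tl.store` when the output is float16. A minimal single-tile sketch of the same pattern, with hypothetical names and none of the blksprs-specific indexing:

import torch
import triton
import triton.language as tl


@triton.jit
def tile_matmul_kernel(x_ptr, y_ptr, o_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    # Load one BLOCK x BLOCK tile of each operand
    blk_x = tl.load(x_ptr + offs[:, None] * BLOCK + offs[None, :])
    blk_y = tl.load(y_ptr + offs[:, None] * BLOCK + offs[None, :])

    # tl.dot accumulates in float32 even for float16 inputs
    buf = tl.dot(blk_x, blk_y)

    # Cast the buffer to the output's element type before storing,
    # mirroring the change to matmul_kernel above
    buf = buf.to(o_ptr.dtype.element_ty)
    tl.store(o_ptr + offs[:, None] * BLOCK + offs[None, :], buf)


x = torch.randn(32, 32, device="cuda", dtype=torch.float16)
y = torch.randn(32, 32, device="cuda", dtype=torch.float16)
o = torch.empty(32, 32, device="cuda", dtype=torch.float16)
tile_matmul_kernel[(1,)](x, y, o, BLOCK=32)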
blksprs/ops/misc/row_wise.py
CHANGED

@@ -10,6 +10,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity_block_size


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
                  flag_slice_only: bool = False) -> (BlksprsTensor, Tensor):
     """Computes the row-wise sum of a block-sparse tensor.

@@ -156,6 +157,7 @@ def row_wise_sum_kernel(x,
     tl.atomic_add(o + o_idx, buf, o_msk)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
                  flag_slice_only: bool = False) -> (BlksprsTensor, Tensor):
     """Computes the row-wise max of a block-sparse tensor.

@@ -304,6 +306,7 @@ def row_wise_max_kernel(x,
     tl.atomic_max(o + o_idx, buf, o_msk)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
                  sparsity_block_size: int) -> BlksprsTensor:
     """For each row in ``y`` adds the value to each value in the corresponding row of the block-sparse tensor ``x``.
blksprs/ops/partitioning.py
CHANGED

@@ -8,6 +8,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity, validate_sparsity_block_size


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
           dim: int, sparsity_block_size: int, lut: dict = None) -> (
         BlksprsTensor, Tensor):

@@ -111,6 +112,7 @@ def split_setup_context(ctx, inputs, output):
 split_forward.register_autograd(split_backward, setup_context=split_setup_context)


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
           dim: int, sparsity_block_size: int, lut: dict = None) -> (
         BlksprsTensor, Tensor):
blksprs/ops/repeat.py
CHANGED

@@ -8,6 +8,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity, validate_sparsity_block_size


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int, int],
            sparsity_block_size: int, sparsity_layout_output: Tensor = None, lut: dict = None) -> (
         BlksprsTensor, Tensor):

@@ -50,6 +51,7 @@ def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int,
         lut["sparsity_reverse_lut"], sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_o"]


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,
                       sparsity_block_size: int, sparsity_layout_output: Tensor = None, lut: dict = None) -> (
         BlksprsTensor, Tensor):
blksprs/ops/softmax.py
CHANGED

@@ -9,9 +9,10 @@ from blksprs.ops.misc.row_wise import row_wise_sum, row_wise_max, row_wise_sub
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride, get_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int, lut: dict = None) -> BlksprsTensor:
     """Computes the softmax of a block-sparse tensor in compressed form.

@@ -32,6 +33,7 @@ def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,

     validate_dimensions(x)
     validate_contiguous(x)
+    validate_dtype_float_32(x)
     validate_device(x)
     validate_sparsity(sparsity_block_size, (x, sparsity_layout))
     validate_sparsity_block_size(sparsity_block_size, x)
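`softmax` is the one op that both validates for float32 (`validate_dtype_float_32`) and casts to float32 under autocast (`cast_inputs=torch.float32`), presumably to keep the exponentiation numerically stable. A hedged usage sketch; the shapes and block size are chosen only for illustration:

import torch
from blksprs.layouting.sparsity_layout import build_sparsity_layout
from blksprs.ops.conversion import to_sparse
from blksprs.ops.softmax import softmax

x = torch.randn(2, 64, 64, device="cuda", dtype=torch.float32)
sparsity_layout = build_sparsity_layout(x, sparsity_block_size=32)
x_bs = to_sparse(x, sparsity_layout, 32)

with torch.autocast(device_type="cuda"):
    # custom_fwd casts the inputs to float32, so the new
    # validate_dtype_float_32 check passes even in a mixed-precision
    # region; outside autocast, a float16 input would now be rejected.
    out = softmax(x_bs, sparsity_layout, 32)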
blksprs/ops/transpose.py
CHANGED

@@ -8,6 +8,7 @@ from blksprs.utils.validation import validate_dimensions, validate_contiguous, v
     validate_sparsity, validate_sparsity_block_size


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def transpose(x: BlksprsTensor, sparsity_layout: Tensor,
               sparsity_block_size: int, lut: dict = None) -> (BlksprsTensor, Tensor):
     """Transposes a block-sparse tensor in compressed form.
blksprs/utils/processing.py
CHANGED

@@ -11,6 +11,7 @@ from blksprs.ops.repeat import repeat
 from blksprs.utils.blksprs_tensor import BlksprsTensor


+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int,
                        linear: nn.Linear, bias: nn.Parameter = None) -> (BlksprsTensor, Tensor):
     # Extract weight and bias

@@ -25,7 +26,8 @@ def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block

     # Apply weights
     sparsity_layout_xw = build_sparsity_layout_matmul_fast(sparsity_layout, sparsity_layout_w_t)
-    xw = matmul(x, sparsity_layout, w_t_bs, sparsity_layout_w_t, sparsity_layout_xw, sparsity_block_size)
+    # TODO At the moment, manual cast is needed. Bug with custom_fwd?
+    xw = matmul(x, sparsity_layout, BlksprsTensor(w_t_bs.to(x.dtype)), sparsity_layout_w_t, sparsity_layout_xw, sparsity_block_size)
     interim = xw

     # Apply bias
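The TODO and manual cast are consistent with how `cast_inputs` works: it casts only the tensor arguments of the decorated call, so the weight reached through the `nn.Linear` module is never cast and can still be float32 while `x` has been cast to float16. A sketch of the mismatch and the workaround, with a hypothetical `apply_weight` standing in for `apply_torch_linear`:

import torch
from torch import nn

@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
def apply_weight(x: torch.Tensor, linear: nn.Linear) -> torch.Tensor:
    # x was cast to float16 at the function boundary, but linear.weight
    # was not: cast_inputs only affects tensor arguments, and the weight
    # arrives inside a module object. Autocast is disabled in the body,
    # so a float16 @ float32 matmul would raise a dtype error.
    w = linear.weight.transpose(-1, -2)
    return x @ w.to(x.dtype)  # manual cast, as in apply_torch_linear

lin = nn.Linear(8, 8, device="cuda")  # weight defaults to float32
x = torch.randn(4, 8, device="cuda")
with torch.autocast(device_type="cuda"):
    y = apply_weight(x, lin)
print(y.dtype)  # torch.float16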
blksprs/utils/validation.py
CHANGED

@@ -26,6 +26,23 @@ def validate_dtype_float(*tensors: Tensor) -> None:
     if _check_skip_validation():
         return

+    dtype = None
+
+    for i, tensor in enumerate(tensors):
+        if i == 0:
+            dtype = tensor.dtype
+
+        if tensor.dtype != torch.float16 and tensor.dtype != torch.float32:
+            raise ValueError("Tensor must have either float16 or float32 dtype")
+
+        if tensor.dtype != dtype:
+            raise ValueError("Tensors must have same dtype")
+
+
+def validate_dtype_float_32(*tensors: Tensor) -> None:
+    if _check_skip_validation():
+        return
+
     for tensor in tensors:
         if tensor.dtype != torch.float32:
             raise ValueError("Tensor must have float32 dtype")

@@ -38,7 +55,7 @@ def validate_dtype_int(*tensors: Tensor) -> None:
     for tensor in tensors:
         if (tensor.dtype !=
                 torch.int32 and tensor.dtype != torch.int64):
-            raise ValueError("Tensor must have int32 or int64 dtype")
+            raise ValueError("Tensor must have either int32 or int64 dtype")


 def validate_device(*tensors: Tensor) -> None:

@@ -51,7 +68,7 @@ def validate_device(*tensors: Tensor) -> None:
         if i == 0:
             device = tensor.device

-        if not device.type ==
+        if not device.type == "cuda":
             raise ValueError("Tensors must be on GPU")

         if tensor.device != device:
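The reworked `validate_dtype_float` now checks two properties: every tensor must be float16 or float32, and all tensors must share the dtype of the first one. A short sketch of the resulting behavior, assuming validation is not skipped via `_check_skip_validation()`:

import torch
from blksprs.utils.validation import validate_dtype_float

a = torch.randn(4, dtype=torch.float16)
b = torch.randn(4, dtype=torch.float32)

validate_dtype_float(a, a)  # passes: both float16
validate_dtype_float(b, b)  # passes: both float32

try:
    validate_dtype_float(a, b)  # mixed dtypes
except ValueError as e:
    print(e)  # Tensors must have same dtype

try:
    validate_dtype_float(a.to(torch.bfloat16))
except ValueError as e:
    print(e)  # Tensor must have either float16 or float32 dtype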
{blksprs-2.0rc1.dist-info → blksprs-2.0rc3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.0rc1
+Version: 2.0rc3
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs

@@ -27,7 +27,7 @@ Requires-Dist: matplotlib; extra == "test"
 ### News

 🎉 ***Version 2.0 released***. blksprs now supports kernel auto-tuning, JIT compilation, specification of pre-calculated
-LUTs, and makes use of `torch.library.triton_op()`!
+LUTs, autocasting, and makes use of `torch.library.triton_op()`!

 ---
blksprs-2.0rc3.dist-info/RECORD
ADDED

@@ -0,0 +1,22 @@
+blksprs/__init__.py,sha256=OHfpwJCZWGUfpT-DVfC1YSaeZl4aCMNt9CrzMPymywU,1577
+blksprs/layouting/distribution_layout.py,sha256=0glIteoY5oDkiEu5rjLIC-BB_oC4sa3rFWVkohsAG00,5329
+blksprs/layouting/sparsity_layout.py,sha256=ZUhJm1jJn-npiJWFjsVyzjXDQOp8z-Wjjv0MPQOXRvg,10490
+blksprs/ops/conversion.py,sha256=pdoWhqEbgsB4STr_NjDcuLUlzSGdYCMaGrW7IOSfxiA,22411
+blksprs/ops/distribution.py,sha256=hLpKUoS553jM_F13WyLNNf73PM1yLqgDTkZUdW_pleo,21490
+blksprs/ops/flow.py,sha256=G8L_sMAWIM77gv-YLJtyutEzXqyaaofnSX2QKvmDr44,8409
+blksprs/ops/matmul.py,sha256=t9JUujkG-sGu4iyM4bjgrZJeNtMk3l8tk7rzYvWBCR8,12004
+blksprs/ops/partitioning.py,sha256=nAV28f3NtvT4OFvDtnE0A-VxpDQmMXS0pZw4CJwzqGA,9838
+blksprs/ops/repeat.py,sha256=bQpJuwtt8aRdSzxT78lJ8f8fLDhPkYK5UvMfJ-PQrkc,8977
+blksprs/ops/softmax.py,sha256=-9wFmQpnnCGK-xOZe-5L_cCxl5Cn_GNc9QGvhSQbRe4,12918
+blksprs/ops/transpose.py,sha256=PQKteFnzNAOEC7voO7wh_dq9c54UjCboJz889aBCwKc,4010
+blksprs/ops/misc/broadcast_ops.py,sha256=lZ5bBIftUKffzeYz77SWB1xmtZTRGMvjF-tG9rqkOXA,6018
+blksprs/ops/misc/row_wise.py,sha256=NcnLaXlPM7aQSoKXHYInao8F0xSQHixbVz-xebF5Bx0,19739
+blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
+blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
+blksprs/utils/processing.py,sha256=xuu9iDpwTvsqI_WKMSD8QCNuvPnfcKMRcuF2L4Zs6Ts,3808
+blksprs/utils/tools.py,sha256=RL18P4NAj7d8gXTTKbMZt4SHCynsw1wPu9yvlrnBQlo,1220
+blksprs/utils/validation.py,sha256=7ks9hdNKbov1JE9y1bpnIfjWCVhqINTZOIZPi6d7k8E,4241
+blksprs-2.0rc3.dist-info/METADATA,sha256=58xKs5zAesWFMPGu4d0jLPth4yUNS95MGPqqMpn-syM,8614
+blksprs-2.0rc3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+blksprs-2.0rc3.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-2.0rc3.dist-info/RECORD,,
blksprs-2.0rc1.dist-info/RECORD
DELETED

@@ -1,22 +0,0 @@
-blksprs/__init__.py,sha256=OHfpwJCZWGUfpT-DVfC1YSaeZl4aCMNt9CrzMPymywU,1577
-blksprs/layouting/distribution_layout.py,sha256=0glIteoY5oDkiEu5rjLIC-BB_oC4sa3rFWVkohsAG00,5329
-blksprs/layouting/sparsity_layout.py,sha256=UzMcdW7l4zoiLB_LMEbBR1JBdqVSgINDGYvoCYIOulk,10283
-blksprs/ops/conversion.py,sha256=_JKOovDZOmYJLcurJGhgNt5iQB9kOKp3fufFxD8QCZs,22204
-blksprs/ops/distribution.py,sha256=5gE19kPQGQljVbRpDZeqNaOe8ehRhxdQS7PiJp6mMug,21352
-blksprs/ops/flow.py,sha256=G8L_sMAWIM77gv-YLJtyutEzXqyaaofnSX2QKvmDr44,8409
-blksprs/ops/matmul.py,sha256=YAurJcXa_39gRdh2nWUOmbhm8h99arLoO-SN-l134II,11879
-blksprs/ops/partitioning.py,sha256=AooYZOw0oZgA9zXSu09O60hkJcnpWT1OTosr2T2wdQo,9700
-blksprs/ops/repeat.py,sha256=qty0qIFcfiWzROV2A2FB2KiPCC2Pe4q5TwJyGuDBAQE,8839
-blksprs/ops/softmax.py,sha256=eaZ8pfCpNZCX6Gk5Tk-lhNIrBQDhvfHqNNPltqxp91k,12793
-blksprs/ops/transpose.py,sha256=30pGCSjZs42Sg6TEXUdJNCDgmlN1n8aN88uNbV5wOtA,3941
-blksprs/ops/misc/broadcast_ops.py,sha256=lZ5bBIftUKffzeYz77SWB1xmtZTRGMvjF-tG9rqkOXA,6018
-blksprs/ops/misc/row_wise.py,sha256=iwOrHU8HiJGxq2hEmgJGZ60asRm72WLi10-PrpNrdeQ,19532
-blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
-blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
-blksprs/utils/processing.py,sha256=WLuMJQ8v-YovXwcDjhlDn3N31WMZXrtyeeyKSgq_zn4,3642
-blksprs/utils/tools.py,sha256=RL18P4NAj7d8gXTTKbMZt4SHCynsw1wPu9yvlrnBQlo,1220
-blksprs/utils/validation.py,sha256=_Ee6bqu7CxdYLFSy4WZOFoXJgd0p_RBMumCwGCk2_Hw,3763
-blksprs-2.0rc1.dist-info/METADATA,sha256=zXzVOvuwgYSyx-lCBycdFvRUmHUD_qYbK8sFkKWZnp8,8601
-blksprs-2.0rc1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-blksprs-2.0rc1.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
-blksprs-2.0rc1.dist-info/RECORD,,
{blksprs-2.0rc1.dist-info → blksprs-2.0rc3.dist-info}/top_level.txt
File without changes