blksprs 2.0rc6__py3-none-any.whl → 2.0rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blksprs/ops/distribution.py +1 -1
- blksprs/ops/misc/broadcast_ops.py +1 -0
- blksprs/ops/misc/row_wise.py +7 -3
- blksprs/utils/tools.py +0 -9
- {blksprs-2.0rc6.dist-info → blksprs-2.0rc7.dist-info}/METADATA +7 -3
- {blksprs-2.0rc6.dist-info → blksprs-2.0rc7.dist-info}/RECORD +8 -8
- {blksprs-2.0rc6.dist-info → blksprs-2.0rc7.dist-info}/WHEEL +0 -0
- {blksprs-2.0rc6.dist-info → blksprs-2.0rc7.dist-info}/top_level.txt +0 -0
blksprs/ops/distribution.py
CHANGED
|
@@ -240,7 +240,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
|
|
|
240
240
|
reduce_op="none", lut=lut)
|
|
241
241
|
|
|
242
242
|
|
|
243
|
-
@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.
|
|
243
|
+
@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
|
|
244
244
|
def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
|
|
245
245
|
dim: int,
|
|
246
246
|
idx: BlksprsTensor,
|
|
@@ -12,6 +12,7 @@ from blksprs.utils.validation import validate_contiguous, validate_device, \
|
|
|
12
12
|
validate_sparsity_block_size
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
|
|
15
16
|
def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
|
|
16
17
|
sparsity_block_size: int) -> BlksprsTensor:
|
|
17
18
|
"""Performs a broadcast and subsequent addition of two dense tensors x and y. Returns a block-sparse tensor in
|
blksprs/ops/misc/row_wise.py
CHANGED
|
@@ -4,9 +4,9 @@ from torch import Tensor
|
|
|
4
4
|
from torch._library.triton import wrap_triton, triton_op
|
|
5
5
|
from triton import language as tl
|
|
6
6
|
|
|
7
|
-
from blksprs.utils.blksprs_tensor import BlksprsTensor
|
|
8
|
-
from blksprs.utils.tools import stride, get_autocast_min_val
|
|
9
7
|
from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
|
|
8
|
+
from blksprs.utils.blksprs_tensor import BlksprsTensor
|
|
9
|
+
from blksprs.utils.tools import stride
|
|
10
10
|
from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, validate_sparsity, \
|
|
11
11
|
validate_sparsity_block_size
|
|
12
12
|
|
|
@@ -95,6 +95,7 @@ def row_wise_sum_forward(x: Tensor, sparsity_lut: Tensor,
|
|
|
95
95
|
return output
|
|
96
96
|
|
|
97
97
|
|
|
98
|
+
# noinspection PyUnusedLocal
|
|
98
99
|
@triton.autotune(
|
|
99
100
|
configs=get_autotune_configs(),
|
|
100
101
|
key=["sparsity_block_size"],
|
|
@@ -175,6 +176,8 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
|
|
|
175
176
|
of the input and the sparsity layout of the output tensor.
|
|
176
177
|
|
|
177
178
|
"""
|
|
179
|
+
# TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376
|
|
180
|
+
x = torch.where(x == -0.0, torch.tensor(0.0), x)
|
|
178
181
|
x = x.contiguous()
|
|
179
182
|
|
|
180
183
|
validate_dimensions(x)
|
|
@@ -209,7 +212,7 @@ def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
|
|
|
209
212
|
output = torch.full(size=(n_sparse_blocks_output,
|
|
210
213
|
sparsity_block_size,
|
|
211
214
|
1 if flag_slice_only else sparsity_block_size),
|
|
212
|
-
fill_value=
|
|
215
|
+
fill_value=torch.finfo(x.dtype).min,
|
|
213
216
|
device=x.device)
|
|
214
217
|
|
|
215
218
|
x_b, x_r, x_c = x.size()
|
|
@@ -238,6 +241,7 @@ def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
|
|
|
238
241
|
return output
|
|
239
242
|
|
|
240
243
|
|
|
244
|
+
# noinspection PyUnusedLocal
|
|
241
245
|
@triton.autotune(
|
|
242
246
|
configs=get_autotune_configs(),
|
|
243
247
|
key=["sparsity_block_size"],
|
blksprs/utils/tools.py
CHANGED
|
@@ -26,12 +26,3 @@ def stride(x: Tensor):
|
|
|
26
26
|
return x.size(1) * x.size(2), x.size(2), 1
|
|
27
27
|
else:
|
|
28
28
|
raise NotImplementedError
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def get_autocast_min_val():
|
|
32
|
-
if torch.is_autocast_enabled():
|
|
33
|
-
dtype = torch.get_autocast_dtype("cuda")
|
|
34
|
-
else:
|
|
35
|
-
dtype = torch.float
|
|
36
|
-
|
|
37
|
-
return torch.finfo(dtype).min
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: blksprs
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.0rc7
|
|
4
4
|
Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
|
|
5
5
|
Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
|
|
6
6
|
Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
|
|
@@ -108,12 +108,16 @@ library.
|
|
|
108
108
|
|
|
109
109
|
## Known Limitations and Issues
|
|
110
110
|
|
|
111
|
+
- Triton has a bug with `tl.atomic_max()` used for the row-wise max operation.
|
|
112
|
+
In order to work around this bug, a manual conversion of some values is needed, (slightly) negatively impacting
|
|
113
|
+
performance.
|
|
114
|
+
Watch the [issue](https://github.com/triton-lang/triton/issues/6376) on Triton's issue tracker for more information.
|
|
111
115
|
- PyTorch's `wrap_triton()` currently does not support config pruning. It thus cannot be used for some of the kernels,
|
|
112
116
|
which could impact graph compilation.
|
|
113
117
|
- There seem to be some issues with autocasting, forcing some operations to manually cast.
|
|
114
118
|
- There will be some slight numerical differences between vanilla and blksprs operations.
|
|
115
|
-
These instabilities are due to Triton and thus cannot be fixed by this library alone.
|
|
116
|
-
However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
|
|
119
|
+
These instabilities are due to Triton and thus cannot be fixed by this library alone.
|
|
120
|
+
However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
|
|
117
121
|
|
|
118
122
|
## Usage
|
|
119
123
|
|
|
@@ -2,22 +2,22 @@ blksprs/__init__.py,sha256=OHfpwJCZWGUfpT-DVfC1YSaeZl4aCMNt9CrzMPymywU,1577
|
|
|
2
2
|
blksprs/layouting/distribution_layout.py,sha256=TkMh_DYKX56Cb8Vq7EHyupMRvzm0XbUNP8QP7afv9wM,5122
|
|
3
3
|
blksprs/layouting/sparsity_layout.py,sha256=6GOjwllDUK9L8jEQNu2i17Pp1BIIQm8fv3xVuiR0zIw,10228
|
|
4
4
|
blksprs/ops/conversion.py,sha256=2zAdbaZ1iP2lisLVeG-k-f571G4HJapADhSwpY0Zd3o,21503
|
|
5
|
-
blksprs/ops/distribution.py,sha256=
|
|
5
|
+
blksprs/ops/distribution.py,sha256=6joac_zl3ZnRkPqLPQ0d88r7IbcrWAg0HiV93LOZw-w,20453
|
|
6
6
|
blksprs/ops/flow.py,sha256=UO5ba5TFgVpEyT7r0hnWYw3vhRDpBOxyPHUBeNOAYPs,7935
|
|
7
7
|
blksprs/ops/matmul.py,sha256=02hujXMtFgF7ohepM3v6h9okrfcU-J3mQZV17B-qvh0,12235
|
|
8
8
|
blksprs/ops/partitioning.py,sha256=nAV28f3NtvT4OFvDtnE0A-VxpDQmMXS0pZw4CJwzqGA,9838
|
|
9
9
|
blksprs/ops/repeat.py,sha256=bQpJuwtt8aRdSzxT78lJ8f8fLDhPkYK5UvMfJ-PQrkc,8977
|
|
10
10
|
blksprs/ops/softmax.py,sha256=-NoTf1Cpuku9C99N0LuMydT_ObozWTnZJGDZxseXEXI,12209
|
|
11
11
|
blksprs/ops/transpose.py,sha256=PQKteFnzNAOEC7voO7wh_dq9c54UjCboJz889aBCwKc,4010
|
|
12
|
-
blksprs/ops/misc/broadcast_ops.py,sha256=
|
|
13
|
-
blksprs/ops/misc/row_wise.py,sha256=
|
|
12
|
+
blksprs/ops/misc/broadcast_ops.py,sha256=DhUbliT9TBT6zlEjutBmY1EAEUPmYOt2mKQ5i46vN1c,5880
|
|
13
|
+
blksprs/ops/misc/row_wise.py,sha256=5u_J8WOTepvf6XtZ8r0lLPofYrI5fGB7mxSmGC81IR0,19167
|
|
14
14
|
blksprs/utils/autotuning.py,sha256=tDfMWklm2rvbo0-ahH81C3Gg0U6LHjPn3d_3pEOzmJs,2053
|
|
15
15
|
blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
|
|
16
16
|
blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
|
|
17
17
|
blksprs/utils/processing.py,sha256=xuu9iDpwTvsqI_WKMSD8QCNuvPnfcKMRcuF2L4Zs6Ts,3808
|
|
18
|
-
blksprs/utils/tools.py,sha256=
|
|
18
|
+
blksprs/utils/tools.py,sha256=3_2IBbd54vVU4-6m2KtAN7qjU6jeF4UfPkbjeFqMpYo,664
|
|
19
19
|
blksprs/utils/validation.py,sha256=G8eQlvJVMKfEX3k2AwBD0A6Ck-gFoRLpLNY6HXsB3fA,4348
|
|
20
|
-
blksprs-2.
|
|
21
|
-
blksprs-2.
|
|
22
|
-
blksprs-2.
|
|
23
|
-
blksprs-2.
|
|
20
|
+
blksprs-2.0rc7.dist-info/METADATA,sha256=ER9DHdVeYUZUsjE-2bEB9fePw0FVI1vknwPNrj7mDPE,9509
|
|
21
|
+
blksprs-2.0rc7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
22
|
+
blksprs-2.0rc7.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
|
|
23
|
+
blksprs-2.0rc7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|