blksprs-2.0rc4-py3-none-any.whl → blksprs-2.0rc7-py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- blksprs/layouting/distribution_layout.py +11 -15
- blksprs/layouting/sparsity_layout.py +26 -31
- blksprs/ops/conversion.py +45 -63
- blksprs/ops/distribution.py +38 -57
- blksprs/ops/flow.py +22 -33
- blksprs/ops/matmul.py +19 -20
- blksprs/ops/misc/broadcast_ops.py +15 -19
- blksprs/ops/misc/row_wise.py +39 -54
- blksprs/ops/softmax.py +30 -44
- blksprs/utils/autotuning.py +78 -0
- blksprs/utils/tools.py +0 -28
- blksprs/utils/validation.py +3 -0
- {blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/METADATA +18 -5
- blksprs-2.0rc7.dist-info/RECORD +23 -0
- blksprs-2.0rc4.dist-info/RECORD +0 -22
- {blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/WHEEL +0 -0
- {blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/top_level.txt +0 -0
blksprs/utils/autotuning.py
ADDED
@@ -0,0 +1,78 @@
+import os
+
+blksprs_autotune_mode = os.getenv("BLKSPRS_AUTOTUNE", "DEFAULT")
+
+if blksprs_autotune_mode == "TEST":
+    autotune_parameters = [
+        (16, 3, 8),
+
+        (32, 3, 8),
+
+        (64, 3, 8),
+    ]
+elif blksprs_autotune_mode == "DEFAULT":
+    autotune_parameters = [
+        (16, 3, 8),
+        (16, 4, 4),
+        (16, 5, 2),
+
+        (32, 3, 8),
+        (32, 4, 4),
+        (32, 5, 2),
+
+        (64, 3, 8),
+        (64, 4, 4),
+        (64, 5, 2),
+
+        (128, 3, 8),
+        (128, 4, 4),
+        (128, 5, 2),
+    ]
+else:
+    raise NotImplementedError(f"Unknown autotune mode: {blksprs_autotune_mode}")
+
+import torch
+import triton
+
+
+def prune_autotune_configs(autotune_configs, kernel_args, **kwargs):
+    sparsity_block_size = kernel_args["sparsity_block_size"]
+
+    pruned_configs = []
+
+    for config in autotune_configs:
+        if config.kwargs["TRITON_BLOCK_SIZE"] <= sparsity_block_size:
+            pruned_configs.append(config)
+
+    assert len(pruned_configs) > 0, f"No valid autotune configs found for sparsity block size {sparsity_block_size}"
+
+    return pruned_configs
+
+
+def prune_autotune_configs_conversion(autotune_configs, kernel_args, **kwargs):
+    sparsity_block_size_from = kernel_args["sparsity_block_size_from"]
+    sparsity_block_size_to = kernel_args["sparsity_block_size_to"]
+    sparsity_block_size = min(sparsity_block_size_from, sparsity_block_size_to)
+
+    pruned_configs = []
+
+    for config in autotune_configs:
+        if config.kwargs["TRITON_BLOCK_SIZE"] <= sparsity_block_size:
+            pruned_configs.append(config)
+
+    assert len(pruned_configs) > 0, f"No valid autotune configs found for sparsity block size {sparsity_block_size}"
+
+    return pruned_configs
+
+
+@torch.compile
+def get_autotune_configs():
+    global autotune_parameters
+
+    autotune_configs = []
+
+    for block_size, num_stages, num_warps in autotune_parameters:
+        autotune_configs.append(
+            triton.Config({"TRITON_BLOCK_SIZE": block_size}, num_stages=num_stages, num_warps=num_warps))
+
+    return autotune_configs
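For context, a minimal sketch of how these helpers plug into a Triton kernel, assuming the standard triton.autotune() API: get_autotune_configs() supplies the candidate configs, and prune_autotune_configs() serves as the early_config_prune hook that drops any config whose TRITON_BLOCK_SIZE exceeds the kernel's sparsity_block_size. The kernel itself is a hypothetical stand-in, not actual blksprs code.

import triton
import triton.language as tl

from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs


# Hypothetical kernel for illustration; only the decorator wiring reflects
# the module added above.
@triton.autotune(
    configs=get_autotune_configs(),
    key=["sparsity_block_size"],
    prune_configs_by={"early_config_prune": prune_autotune_configs},
)
@triton.jit
def _demo_kernel(x_ptr, o_ptr, n_elements, sparsity_block_size,
                 TRITON_BLOCK_SIZE: tl.constexpr):
    # Copy x to o in TRITON_BLOCK_SIZE-sized chunks.
    pid = tl.program_id(axis=0)
    offsets = pid * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)
    mask = offsets < n_elements
    tl.store(o_ptr + offsets, tl.load(x_ptr + offsets, mask=mask), mask=mask)

Since the parameter table is selected at import time from the BLKSPRS_AUTOTUNE environment variable, the variable must be set before blksprs is first imported (e.g. BLKSPRS_AUTOTUNE=TEST for the reduced test grid).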
blksprs/utils/tools.py
CHANGED
@@ -1,5 +1,4 @@
 import torch
-import triton
 from torch import Tensor, Size
 
 # Capture scalar outputs for JIT compilation
@@ -27,30 +26,3 @@ def stride(x: Tensor):
         return x.size(1) * x.size(2), x.size(2), 1
     else:
         raise NotImplementedError
-
-
-@torch.compile
-def get_autotune_configs():
-    configs = []
-    config_parameters = [
-        (16, 3, 8),
-        (16, 4, 4),
-        (16, 5, 2),
-
-        (32, 3, 8),
-        (32, 4, 4),
-        (32, 5, 2),
-
-        (64, 3, 8),
-        (64, 4, 4),
-        (64, 5, 2),
-
-        (128, 3, 8),
-        (128, 4, 4),
-        (128, 5, 2),
-    ]
-
-    for block_size, num_stages, num_warps in config_parameters:
-        configs.append(triton.Config({"TRITON_BLOCK_SIZE": block_size}, num_stages=num_stages, num_warps=num_warps))
-
-    return configs
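With get_autotune_configs() gone (it now lives in blksprs/utils/autotuning.py), tools.py retains only small helpers such as stride(). A minimal sketch of the three-dimensional branch visible in the context lines above, assuming a contiguous tensor:

import torch
from blksprs.utils.tools import stride

x = torch.empty(2, 3, 4)
# Contiguous 3-d case: returns (size(1) * size(2), size(2), 1),
# which matches torch's own strides for this layout.
assert stride(x) == x.stride()  # (12, 4, 1)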
blksprs/utils/validation.py
CHANGED
@@ -113,6 +113,9 @@ def validate_sparsity_block_size(sparsity_block_size: int, *tensors):
     if _check_skip_validation():
         return
 
+    if not sparsity_block_size >= 16:
+        raise ValueError("Sparsity block size must be at least 16")
+
     if not (sparsity_block_size & (sparsity_block_size - 1)) == 0:
         raise ValueError("Sparsity block size must be a power of 2")
 
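A minimal sketch of the tightened check, assuming validation is not disabled via the library's skip-validation switch: a sparsity block size must now be a power of two and at least 16. The power-of-two test uses the standard bit trick, since n & (n - 1) clears the lowest set bit and is therefore zero exactly for powers of two.

from blksprs.utils.validation import validate_sparsity_block_size

validate_sparsity_block_size(32)  # power of two and >= 16: passes

try:
    validate_sparsity_block_size(8)  # power of two, but below the new minimum
except ValueError as err:
    print(err)  # Sparsity block size must be at least 16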
{blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.0rc4
+Version: 2.0rc7
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
@@ -24,10 +24,10 @@ Requires-Dist: matplotlib; extra == "test"
 
 ## Overview
 
-
-
-
-
+### News
+
+🎉 ***Version 2.0 released***. blksprs now supports kernel auto-tuning, JIT compilation, specification of pre-calculated
+LUTs, autocasting, and makes use of `torch.library.triton_op()`!
 
 ---
 
@@ -106,6 +106,19 @@ We also encourage [pull requests](https://github.com/FelixSchoen/blksprs/pulls).
 It might be that this changes with future projects, but as of March 2025, we are content with the current state of the
 library.
 
+## Known Limitations and Issues
+
+- Triton has a bug with `tl.atomic_max()` used for the row-wise max operation.
+  In order to work around this bug a manual conversion of some values is needed, (slightly) negatively impacting
+  performance.
+  Watch the [issue](https://github.com/triton-lang/triton/issues/6376) on Triton's issue tracker for more information.
+- PyTorch's `wrap_triton()` currently does not support config pruning. It thus cannot be used for some of the kernels,
+  which could impact graph compilation.
+- There seem to be some issues with autocasting, forcing some operations to manually cast.
+- There will be some slight numerical differences between vanilla and blksprs operations.
+  These instabilities are due to Triton and thus cannot be fixed by this library alone.
+  However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
+
 ## Usage
 
 We provide an example below to demonstrate the usage of the library.
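The torch.library.triton_op() mentioned in the news entry is PyTorch's mechanism for registering Triton kernels as custom operators that compose with torch.compile. Below is a hedged, self-contained sketch of the pattern; the kernel and the blksprs_demo::copy name are illustrative, not blksprs code, and the wrap_triton() pruning limitation listed above applies to autotuned kernels that rely on prune_configs_by.

import torch
import triton
import triton.language as tl
from torch.library import triton_op, wrap_triton


@triton.jit
def _copy_kernel(x_ptr, o_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    tl.store(o_ptr + offsets, tl.load(x_ptr + offsets, mask=mask), mask=mask)


@triton_op("blksprs_demo::copy", mutates_args={})
def copy(x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    n_elements = x.numel()
    grid = (triton.cdiv(n_elements, 1024),)
    # wrap_triton() makes the raw kernel traceable, so torch.compile can see
    # through the custom op instead of treating it as opaque.
    wrap_triton(_copy_kernel)[grid](x, out, n_elements, BLOCK=1024)
    return out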
blksprs-2.0rc7.dist-info/RECORD
ADDED
@@ -0,0 +1,23 @@
+blksprs/__init__.py,sha256=OHfpwJCZWGUfpT-DVfC1YSaeZl4aCMNt9CrzMPymywU,1577
+blksprs/layouting/distribution_layout.py,sha256=TkMh_DYKX56Cb8Vq7EHyupMRvzm0XbUNP8QP7afv9wM,5122
+blksprs/layouting/sparsity_layout.py,sha256=6GOjwllDUK9L8jEQNu2i17Pp1BIIQm8fv3xVuiR0zIw,10228
+blksprs/ops/conversion.py,sha256=2zAdbaZ1iP2lisLVeG-k-f571G4HJapADhSwpY0Zd3o,21503
+blksprs/ops/distribution.py,sha256=6joac_zl3ZnRkPqLPQ0d88r7IbcrWAg0HiV93LOZw-w,20453
+blksprs/ops/flow.py,sha256=UO5ba5TFgVpEyT7r0hnWYw3vhRDpBOxyPHUBeNOAYPs,7935
+blksprs/ops/matmul.py,sha256=02hujXMtFgF7ohepM3v6h9okrfcU-J3mQZV17B-qvh0,12235
+blksprs/ops/partitioning.py,sha256=nAV28f3NtvT4OFvDtnE0A-VxpDQmMXS0pZw4CJwzqGA,9838
+blksprs/ops/repeat.py,sha256=bQpJuwtt8aRdSzxT78lJ8f8fLDhPkYK5UvMfJ-PQrkc,8977
+blksprs/ops/softmax.py,sha256=-NoTf1Cpuku9C99N0LuMydT_ObozWTnZJGDZxseXEXI,12209
+blksprs/ops/transpose.py,sha256=PQKteFnzNAOEC7voO7wh_dq9c54UjCboJz889aBCwKc,4010
+blksprs/ops/misc/broadcast_ops.py,sha256=DhUbliT9TBT6zlEjutBmY1EAEUPmYOt2mKQ5i46vN1c,5880
+blksprs/ops/misc/row_wise.py,sha256=5u_J8WOTepvf6XtZ8r0lLPofYrI5fGB7mxSmGC81IR0,19167
+blksprs/utils/autotuning.py,sha256=tDfMWklm2rvbo0-ahH81C3Gg0U6LHjPn3d_3pEOzmJs,2053
+blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
+blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
+blksprs/utils/processing.py,sha256=xuu9iDpwTvsqI_WKMSD8QCNuvPnfcKMRcuF2L4Zs6Ts,3808
+blksprs/utils/tools.py,sha256=3_2IBbd54vVU4-6m2KtAN7qjU6jeF4UfPkbjeFqMpYo,664
+blksprs/utils/validation.py,sha256=G8eQlvJVMKfEX3k2AwBD0A6Ck-gFoRLpLNY6HXsB3fA,4348
+blksprs-2.0rc7.dist-info/METADATA,sha256=ER9DHdVeYUZUsjE-2bEB9fePw0FVI1vknwPNrj7mDPE,9509
+blksprs-2.0rc7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+blksprs-2.0rc7.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-2.0rc7.dist-info/RECORD,,
blksprs-2.0rc4.dist-info/RECORD
DELETED
@@ -1,22 +0,0 @@
-blksprs/__init__.py,sha256=OHfpwJCZWGUfpT-DVfC1YSaeZl4aCMNt9CrzMPymywU,1577
-blksprs/layouting/distribution_layout.py,sha256=0glIteoY5oDkiEu5rjLIC-BB_oC4sa3rFWVkohsAG00,5329
-blksprs/layouting/sparsity_layout.py,sha256=ZUhJm1jJn-npiJWFjsVyzjXDQOp8z-Wjjv0MPQOXRvg,10490
-blksprs/ops/conversion.py,sha256=FsujfUH3R8ijSti_ifsTQihB0djK8Snny2fbGRruzRw,22459
-blksprs/ops/distribution.py,sha256=CTcDcUx8vwe-9F9Y25B7ea7tcvy5gR2Pyk0Ko48MWFo,21514
-blksprs/ops/flow.py,sha256=MY1ypGLIAlkZty5iQINip5mDIQxu9pP1D1dIae4sKJg,8433
-blksprs/ops/matmul.py,sha256=xFxWSCy9NwPDTxfSUOyQU_X4sHp3HrJtohlUCc1WO8g,12028
-blksprs/ops/partitioning.py,sha256=nAV28f3NtvT4OFvDtnE0A-VxpDQmMXS0pZw4CJwzqGA,9838
-blksprs/ops/repeat.py,sha256=bQpJuwtt8aRdSzxT78lJ8f8fLDhPkYK5UvMfJ-PQrkc,8977
-blksprs/ops/softmax.py,sha256=PdRPAkCJahtGBO5W-aqF_Dxi9X8RJ621XmYfVo2I0OM,12968
-blksprs/ops/transpose.py,sha256=PQKteFnzNAOEC7voO7wh_dq9c54UjCboJz889aBCwKc,4010
-blksprs/ops/misc/broadcast_ops.py,sha256=lZ5bBIftUKffzeYz77SWB1xmtZTRGMvjF-tG9rqkOXA,6018
-blksprs/ops/misc/row_wise.py,sha256=FOy73-I5_OuCugiq0xQxtre9-ytfBQPDaXQv8tssuXg,19764
-blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
-blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
-blksprs/utils/processing.py,sha256=xuu9iDpwTvsqI_WKMSD8QCNuvPnfcKMRcuF2L4Zs6Ts,3808
-blksprs/utils/tools.py,sha256=RL18P4NAj7d8gXTTKbMZt4SHCynsw1wPu9yvlrnBQlo,1220
-blksprs/utils/validation.py,sha256=7ks9hdNKbov1JE9y1bpnIfjWCVhqINTZOIZPi6d7k8E,4241
-blksprs-2.0rc4.dist-info/METADATA,sha256=uM3Ssh-i170VnuaaPf-kjM4EwztirvAXlU7xINY6YhM,8614
-blksprs-2.0rc4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-blksprs-2.0rc4.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
-blksprs-2.0rc4.dist-info/RECORD,,
{blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/WHEEL
File without changes
{blksprs-2.0rc4.dist-info → blksprs-2.0rc7.dist-info}/top_level.txt
File without changes