blksprs 2.1.7.tar.gz → 2.1.9.tar.gz
- {blksprs-2.1.7 → blksprs-2.1.9}/PKG-INFO +3 -3
- {blksprs-2.1.7 → blksprs-2.1.9}/README.md +1 -1
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/__init__.py +10 -2
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/layouting/distribution_layout.py +11 -8
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/layouting/sparsity_layout.py +15 -10
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/conversion.py +28 -25
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/distribution.py +28 -28
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/flow.py +13 -11
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/matmul.py +16 -14
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/misc/broadcast_ops.py +8 -8
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/misc/row_wise.py +24 -23
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/partitioning.py +2 -2
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/repeat.py +2 -2
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/softmax.py +38 -33
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/transpose.py +3 -3
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/autotuning.py +1 -1
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/blksprs_tensor.py +10 -1
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/processing.py +2 -1
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/tools.py +2 -5
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs.egg-info/PKG-INFO +3 -3
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs.egg-info/requires.txt +1 -1
- {blksprs-2.1.7 → blksprs-2.1.9}/pyproject.toml +2 -2
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/benchmarking.py +0 -0
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/validation.py +0 -0
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs.egg-info/SOURCES.txt +0 -0
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs.egg-info/dependency_links.txt +0 -0
- {blksprs-2.1.7 → blksprs-2.1.9}/blksprs.egg-info/top_level.txt +0 -0
- {blksprs-2.1.7 → blksprs-2.1.9}/setup.cfg +0 -0
{blksprs-2.1.7 → blksprs-2.1.9}/PKG-INFO

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.1.7
+Version: 2.1.9
 Summary: A lightweight library for operations on block-sparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
 Project-URL: Bugtracker, https://github.com/FelixSchoen/blksprs/issues
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
-Requires-Dist: torch
+Requires-Dist: torch>=2.8.0
 Requires-Dist: numpy
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"

@@ -102,7 +102,7 @@ We will continue to maintain the library and fix any issues that arise.
 Should you find any bugs please open an [issue](https://github.com/FelixSchoen/blksprs/issues).
 We also encourage [pull requests](https://github.com/FelixSchoen/blksprs/pulls).
 
-It might be that this changes with future projects, but as of
+It might be that this changes with future projects, but as of August 2025, we are content with the current state of the
 library.
 
 ## Known Limitations and Issues
{blksprs-2.1.7 → blksprs-2.1.9}/README.md

@@ -83,7 +83,7 @@ We will continue to maintain the library and fix any issues that arise.
 Should you find any bugs please open an [issue](https://github.com/FelixSchoen/blksprs/issues).
 We also encourage [pull requests](https://github.com/FelixSchoen/blksprs/pulls).
 
-It might be that this changes with future projects, but as of
+It might be that this changes with future projects, but as of August 2025, we are content with the current state of the
 library.
 
 ## Known Limitations and Issues
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/__init__.py

@@ -1,6 +1,14 @@
-__version__ = "2.1.7"
+# Settings
+import torch
+
+# Capture scalar outputs for JIT compilation
+torch._dynamo.config.capture_scalar_outputs = True
+# Set version
+__version__ = "2.1.9"
 
-from blksprs.utils.blksprs_tensor import BlksprsTensor
+# Imports
+
+from blksprs.utils.blksprs_tensor import BlksprsTensor
 
 
 class ops:
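The new `__init__.py` makes two import-time changes visible above: it pins `__version__` and globally enables dynamo's scalar-output capture. A minimal sketch of what a downstream user observes after `import blksprs` (assuming a working install; the `assert` is illustrative, not part of the package):

```python
import torch
import blksprs

# capture_scalar_outputs lets torch.compile trace .item()-style scalar
# reads instead of breaking the graph; 2.1.9 enables it as a side effect
# of importing the package.
assert torch._dynamo.config.capture_scalar_outputs is True
print(blksprs.__version__)  # "2.1.9"
```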
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/layouting/distribution_layout.py

@@ -7,9 +7,9 @@ from torch._library import triton_op
 from torch._library.triton import wrap_triton
 from triton import language as tl
 
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
-from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous
 

@@ -98,22 +98,25 @@ def build_distribution_layout_kernel(i,
 
     # Get position of current sparsity block consisting of its batch, row, and column index
     spa_bat_i_idx = (pid_blk * s_lut_i_r_s + 0 * s_lut_i_c_s)
-    spa_bat_i_msk = (spa_bat_i_idx >= 0
+    spa_bat_i_msk = ((spa_bat_i_idx >= 0) &
+                     (spa_bat_i_idx < s_lut_i_r * s_lut_i_r_s))
     spa_bat_i = tl.load(s_lut_i + spa_bat_i_idx, mask=spa_bat_i_msk)
 
     spa_row_i_idx = (pid_blk * s_lut_i_r_s + 1 * s_lut_i_c_s)
-    spa_row_i_msk = (spa_row_i_idx >= 0
+    spa_row_i_msk = ((spa_row_i_idx >= 0) &
+                     (spa_row_i_idx < s_lut_i_r * s_lut_i_r_s))
     spa_row_i = tl.load(s_lut_i + spa_row_i_idx, mask=spa_row_i_msk)
 
     spa_col_i_idx = (pid_blk * s_lut_i_r_s + 2 * s_lut_i_c_s)
-    spa_col_i_msk = (spa_col_i_idx >= 0
+    spa_col_i_msk = ((spa_col_i_idx >= 0) &
+                     (spa_col_i_idx < s_lut_i_r * s_lut_i_r_s))
     spa_col_i = tl.load(s_lut_i + spa_col_i_idx, mask=spa_col_i_msk)
 
     blk_i_idx = (pid_blk * i_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
-    blk_i_msk = (blk_i_idx >= 0
-                 blk_i_idx < i_b * i_b_s)
+    blk_i_msk = ((blk_i_idx >= 0) &
+                 (blk_i_idx < i_b * i_b_s))
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk)
 
     dst_bat_idx = tl.full((TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), spa_bat_i, dtype=tl.int32)

@@ -131,6 +134,6 @@ def build_distribution_layout_kernel(i,
     blk_o_idx = ((dst_bat_idx * o_b_s) +
                  (dst_row_idx * o_r_s) +
                  (dst_col_idx * o_c_s))
-    blk_o_msk = (blk_o_idx >= 0
-                 blk_o_idx < o_b * o_b_s)
+    blk_o_msk = ((blk_o_idx >= 0) &
+                 (blk_o_idx < o_b * o_b_s))
     tl.store(o + blk_o_idx, blk_v, mask=blk_o_msk)
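Every kernel hunk in this release applies the same fix: a load/store mask that previously checked only the lower bound (or combined bounds without elementwise `&`) now checks both bounds as parenthesized comparisons joined with `&`. A self-contained sketch of the pattern outside blksprs, where `bounded_copy_kernel`, `n`, and `BLOCK` are illustrative names rather than library API (requires a CUDA device):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def bounded_copy_kernel(x_ptr, o_ptr, n, BLOCK: tl.constexpr):
    idx = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    # Bounds must be separate parenthesized comparisons joined with `&`;
    # Python's `and` and chained comparisons do not vectorize in Triton.
    msk = (idx >= 0) & (idx < n)
    val = tl.load(x_ptr + idx, mask=msk)
    tl.store(o_ptr + idx, val, mask=msk)

x = torch.arange(10, device="cuda", dtype=torch.float32)
o = torch.zeros_like(x)
bounded_copy_kernel[(triton.cdiv(x.numel(), 8),)](x, o, x.numel(), BLOCK=8)
assert torch.equal(x, o)
```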
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/layouting/sparsity_layout.py

@@ -6,9 +6,9 @@ from torch import Tensor
 from torch._library.triton import wrap_triton, triton_op
 from triton import language as tl
 
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
-from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.validation import validate_dimensions, validate_device, \
     validate_contiguous, validate_sparsity, validate_sparsity_block_size
 

@@ -79,8 +79,8 @@ def build_sparsity_layout_kernel(x,
     blk_x_idx = (pid_bat * x_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx >= 0
-                 blk_x_idx < x_b * x_b_s)
+    blk_x_msk = ((blk_x_idx >= 0) &
+                 (blk_x_idx < x_b * x_b_s))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Store sparsity layout value

@@ -88,7 +88,8 @@ def build_sparsity_layout_kernel(x,
     blk_o_idx = (pid_bat * o_b_s +
                  (((pid_row * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_r_s +
                   ((pid_col * TRITON_BLOCK_SIZE) // sparsity_block_size) * o_c_s))
-    blk_o_msk = (blk_o_idx >= 0
+    blk_o_msk = ((blk_o_idx >= 0) &
+                 (blk_o_idx < o_b * o_b_s))
     tl.store(o + blk_o_idx, 1, mask=blk_o_msk)
 
 

@@ -178,23 +179,26 @@ def build_sparsity_layout_adaption_kernel(x,
 
     # Get sparsity index of current output block consisting of its batch, row, and column index
     spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0
+    spa_bat_msk = ((spa_bat_idx >= 0) &
+                   (spa_bat_idx < s_lut_r * s_lut_r_s))
     spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
 
     spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0
+    spa_row_msk = ((spa_row_idx >= 0) &
+                   (spa_row_idx < s_lut_r * s_lut_r_s))
     spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
 
     spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx >= 0
+    spa_col_msk = ((spa_col_idx >= 0) &
+                   (spa_col_idx < s_lut_r * s_lut_r_s))
     spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
 
     # Load x values
     blk_x_idx = ((pid_blk * x_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx >= 0
-                 blk_x_idx < x_b * x_b_s)
+    blk_x_msk = ((blk_x_idx >= 0) &
+                 (blk_x_idx < x_b * x_b_s))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Store sparsity layout value

@@ -204,7 +208,8 @@ def build_sparsity_layout_adaption_kernel(x,
                    // sparsity_block_size_to) * o_r_s) +
                  (((pid_col * TRITON_BLOCK_SIZE + spa_col * sparsity_block_size_from)
                    // sparsity_block_size_to) * o_c_s))
-    blk_o_msk = (blk_o_idx >= 0
+    blk_o_msk = ((blk_o_idx >= 0) &
+                 (blk_o_idx < o_b * o_b_s))
     tl.store(o + blk_o_idx, 1, mask=blk_o_msk)
 
 
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/conversion.py

@@ -5,9 +5,9 @@ from torch._library.triton import wrap_triton, triton_op
 from triton import language as tl
 
 from blksprs.layouting.sparsity_layout import build_sparsity_layout_adaption
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
-from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_sparsity_dense, ensure_contiguous
 

@@ -46,10 +46,10 @@ def to_sparse(x: Tensor, sparsity_layout: Tensor,
     lut = to_sparse_build_lut(lut, sparsity_layout)
 
     if sparsity_layout.size(1) == 1 and sparsity_layout.size(2) == 1 and torch.all(sparsity_layout):
-        return BlksprsTensor(x)
+        return BlksprsTensor.wrap(x)
 
-    return BlksprsTensor(to_sparse_forward(x, sparsity_layout,
-
+    return BlksprsTensor.wrap(to_sparse_forward(x, sparsity_layout,
+                                                lut["sparsity_lut"], sparsity_block_size, lut["n_sparse_blocks"]))
 
 
 @triton_op("blksprs::to_sparse_forward", mutates_args={})

@@ -120,16 +120,16 @@ def to_sparse_kernel(x,
                   tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + spa_col * sparsity_block_size +
                   tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_d_msk = (blk_d_idx >= 0
-                 blk_d_idx < x_b * x_b_s)
+    blk_d_msk = ((blk_d_idx >= 0) &
+                 (blk_d_idx < x_b * x_b_s))
     blk_d = tl.load(x + blk_d_idx, mask=blk_d_msk)
 
     # Store block in sparse tensor
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE) * o_c_s))[None, :])
-    blk_o_msk = (blk_o_idx >= 0
-                 blk_o_idx < (pid_blk + 1) * o_b_s)
+    blk_o_msk = ((blk_o_idx >= 0) &
+                 (blk_o_idx < (pid_blk + 1) * o_b_s))
     tl.store(o + blk_o_idx, blk_d, mask=blk_o_msk)
 
 

@@ -201,7 +201,7 @@ def to_dense(x: BlksprsTensor, sparsity_layout: Tensor,
         return x
 
     return Tensor(to_dense_forward(x, sparsity_layout,
-
+                                   lut["sparsity_reverse_lut"], sparsity_block_size, fill_value))
 
 
 @triton_op("blksprs::to_dense_forward", mutates_args={})

@@ -269,7 +269,8 @@ def to_dense_kernel(x,
 
     # Get reverse sparsity index for current block
     rev_idx_spa_idx = (pid_blk * s_l_b_s + spa_row * s_l_r_s + spa_col * s_l_c_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx >= 0
+    rev_idx_spa_msk = ((rev_idx_spa_idx >= 0) &
+                       (rev_idx_spa_idx < s_l_b * s_l_b_s))
     rev_idx_spa = tl.load(sparsity_reverse_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
     # If block is present commence operations

@@ -279,14 +280,15 @@ def to_dense_kernel(x,
                       tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      (((pid_col % (sparsity_block_size // TRITON_BLOCK_SIZE)) * TRITON_BLOCK_SIZE +
                       tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_msk = (blk_idx >= 0
-                   blk_idx < x_b * x_b_s)
+        blk_msk = ((blk_idx >= 0) &
+                   (blk_idx < x_b * x_b_s))
         blk = tl.load(x + blk_idx, mask=blk_msk)
 
         o_idx = (pid_blk * o_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        o_msk = (o_idx >= 0
+        o_msk = ((o_idx >= 0) &
+                 (o_idx < o_b * o_b_s))
         tl.store(o + o_idx, blk, o_msk)
 
 

@@ -360,14 +362,14 @@ def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_
     validate_contiguous(sparsity_reverse_lut_from, sparsity_layout_to, sparsity_lut_to)
 
     if (sparsity_block_size_from == sparsity_block_size_to) and torch.equal(sparsity_layout_from, sparsity_layout_to):
-        return BlksprsTensor(x), sparsity_layout_to
+        return BlksprsTensor.wrap(x), sparsity_layout_to
 
-    return BlksprsTensor(adapt_layout_forward(x,
-
-
-
-
-
+    return BlksprsTensor.wrap(adapt_layout_forward(x,
+                                                   sparsity_layout_from, sparsity_reverse_lut_from,
+                                                   sparsity_block_size_from,
+                                                   sparsity_layout_to, sparsity_lut_to,
+                                                   sparsity_block_size_to,
+                                                   n_sparse_blocks_to)), sparsity_layout_to
 
 
 @triton_op("blksprs::adapt_layout_forward", mutates_args={})

@@ -458,7 +460,8 @@ def adapt_layout_kernel(x,
     rev_idx_spa_x_idx = (spa_bat_x * s_l_x_b_s +
                          spa_row_x * s_l_x_r_s +
                          spa_col_x * s_l_x_c_s)
-    rev_idx_spa_x_msk = (rev_idx_spa_x_idx >= 0
+    rev_idx_spa_x_msk = ((rev_idx_spa_x_idx >= 0) &
+                         (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s))
     rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
 
     # If block is present commence operations

@@ -473,16 +476,16 @@ def adapt_layout_kernel(x,
         blk_x_idx = ((rev_idx_spa_x * x_b_s) +
                      ((shift_row_x * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((shift_col_x * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0
-                     blk_x_idx < x_b * x_b_s)
+        blk_x_msk = ((blk_x_idx >= 0) &
+                     (blk_x_idx < x_b * x_b_s))
        blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         # Store output
         blk_o_idx = ((pid_blk * o_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx >= 0
-                     blk_o_idx < o_b * o_b_s)
+        blk_o_msk = ((blk_o_idx >= 0) &
+                     (blk_o_idx < o_b * o_b_s))
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
 
 
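For orientation, a usage sketch of the conversion path touched above. The call shape is inferred from the hunks (a single all-true block short-circuits to `BlksprsTensor.wrap(x)`), so treat the `bs.ops` names and argument order as assumptions rather than documented API:

```python
import torch
import blksprs as bs

x = torch.randn(2, 64, 64, device="cuda")
# One 64x64 block per batch, all present: to_sparse takes the fast path
# visible in the hunk and returns the input wrapped unchanged.
layout = torch.ones(2, 1, 1, dtype=torch.int32, device="cuda")
xs = bs.ops.to_sparse(x, layout, 64)
xd = bs.ops.to_dense(xs, layout, 64)
assert torch.equal(x, xd)
```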
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/distribution.py

@@ -5,9 +5,9 @@ from torch._library import triton_op
 from torch._library.triton import wrap_triton
 from triton import language as tl
 
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
-from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_dtype_int, validate_sparsity_block_size, ensure_contiguous
 

@@ -45,9 +45,9 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
 
     lut = gather_build_lut(lut, sparsity_layout_src, sparsity_layout_idx)
 
-    return BlksprsTensor(gather_forward(src, sparsity_layout_src, lut["sparsity_reverse_lut_x"],
-
-
+    return BlksprsTensor.wrap(gather_forward(src, sparsity_layout_src, lut["sparsity_reverse_lut_x"],
+                                             adjusted_dim, idx, sparsity_layout_idx, lut["sparsity_lut_i"],
+                                             sparsity_block_size))
 
 
 @triton_op("blksprs::gather_forward", mutates_args={})

@@ -136,8 +136,8 @@ def gather_kernel(x,
     blk_i_idx = ((pid_blk * i_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
-    blk_i_msk = (blk_i_idx >= 0
-                 blk_i_idx < i_b * i_b_s)
+    blk_i_msk = ((blk_i_idx >= 0) &
+                 (blk_i_idx < i_b * i_b_s))
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
 
     # Get indices of sparsity blocks and positions within the blocks

@@ -164,26 +164,26 @@ def gather_kernel(x,
     rev_idx_spa_x_idx = ((rev_dst_bat_x * s_l_x_b_s) +
                         (rev_dst_row_x * s_l_x_r_s) +
                         (rev_dst_col_x * s_l_x_c_s))
-    rev_idx_spa_x_msk = (rev_idx_spa_x_idx >= 0
-                         rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s)
+    rev_idx_spa_x_msk = ((rev_idx_spa_x_idx >= 0) &
+                         (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s))
     rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
 
     # Load x values
     blk_x_idx = ((rev_idx_spa_x * x_b_s) +
                  dst_row_x +
                  dst_col_x)
-    blk_x_msk = ((blk_x_idx >= 0
-                  blk_x_idx < x_b * x_b_s)
-                 rev_idx_spa_x_msk != -1)
+    blk_x_msk = (((blk_x_idx >= 0) &
+                  (blk_x_idx < x_b * x_b_s)) &
+                 (rev_idx_spa_x_msk != -1))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Store output
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = ((blk_o_idx >= 0
-                  blk_o_idx < o_b * o_b_s)
-                 rev_idx_spa_x_msk != -1)
+    blk_o_msk = (((blk_o_idx >= 0) &
+                  (blk_o_idx < o_b * o_b_s)) &
+                 (rev_idx_spa_x_msk != -1))
     tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
 
 

@@ -276,11 +276,11 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
 
     lut = scatter_reduce_build_lut(lut, sparsity_layout_src, sparsity_layout_tgt)
 
-    return BlksprsTensor(scatter_reduce_forward(src, sparsity_layout_src, lut["sparsity_lut_x"],
-
-
-
-
+    return BlksprsTensor.wrap(scatter_reduce_forward(src, sparsity_layout_src, lut["sparsity_lut_x"],
+                                                     adjusted_dim, idx,
+                                                     sparsity_layout_tgt, lut["sparsity_reverse_lut_o"],
+                                                     sparsity_block_size, lut["n_sparse_blocks"],
+                                                     reduce_op))
 
 
 @triton_op("blksprs::scatter_reduce_forward", mutates_args={})

@@ -380,16 +380,16 @@ def scatter_reduce_kernel(x,
     blk_x_idx = ((pid_blk * x_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx >= 0
-                 blk_x_idx < x_b * x_b_s)
+    blk_x_msk = ((blk_x_idx >= 0) &
+                 (blk_x_idx < x_b * x_b_s))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Load index values
     blk_i_idx = ((pid_blk * i_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * i_c_s)[None, :])
-    blk_i_msk = (blk_i_idx >= 0
-                 blk_i_idx < i_b * i_b_s)
+    blk_i_msk = ((blk_i_idx >= 0) &
+                 (blk_i_idx < i_b * i_b_s))
     blk_i = tl.load(i + blk_i_idx, mask=blk_i_msk).to(tl.int32)
 
     # Get indices of sparsity blocks and positions within the blocks

@@ -416,17 +416,17 @@ def scatter_reduce_kernel(x,
     rev_idx_spa_o_idx = ((rev_dst_bat_o * s_l_o_b_s) +
                         (rev_dst_row_o * s_l_o_r_s) +
                         (rev_dst_col_o * s_l_o_c_s))
-    rev_idx_spa_o_msk = (rev_idx_spa_o_idx >= 0
-                         rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s)
+    rev_idx_spa_o_msk = ((rev_idx_spa_o_idx >= 0) &
+                         (rev_idx_spa_o_idx < s_l_o_b * s_l_o_b_s))
     rev_idx_spa_o = tl.load(r_lut_o + rev_idx_spa_o_idx, mask=rev_idx_spa_o_msk).to(tl.int32)
 
     # Store output
     blk_o_idx = ((rev_idx_spa_o * o_b_s) +
                  dst_row_o +
                  dst_col_o)
-    blk_o_msk = ((blk_o_idx >= 0
-                  blk_o_idx < o_b * o_b_s)
-                 rev_idx_spa_o_msk != -1)
+    blk_o_msk = (((blk_o_idx >= 0) &
+                  (blk_o_idx < o_b * o_b_s)) &
+                 (rev_idx_spa_o_msk != -1))
 
     if reduce_op_ind == 0:
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
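The gather/scatter masks above fold a third condition into the bounds check: the reverse LUT marks absent blocks with -1, and loads/stores are additionally masked on that sentinel. A dense sketch of the convention (layout and LUT shapes here are illustrative, not the library's internal format):

```python
import torch

sparsity_layout = torch.tensor([[1, 0],
                                [0, 1]])
# Reverse LUT: flat block position -> index into the sparse block list,
# with -1 as the "block absent" sentinel the kernels test against.
rev_lut = torch.full((sparsity_layout.numel(),), -1, dtype=torch.int32)
rev_lut[sparsity_layout.flatten().bool()] = torch.arange(
    int(sparsity_layout.sum()), dtype=torch.int32)
print(rev_lut)  # tensor([ 0, -1, -1,  1], dtype=torch.int32)
```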
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/flow.py

@@ -5,8 +5,8 @@ from torch._library import triton_op
 from torch._library.triton import wrap_triton
 from triton import language as tl
 
-from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
+from blksprs.utils.tools import stride
 
 
 @triton_op("blksprs::flow_pull_forward", mutates_args={})

@@ -78,22 +78,23 @@ def flow_pull_kernel(x,
     rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
                        spa_row * s_l_o_r_s +
                        spa_col * s_l_o_c_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx >= 0
+    rev_idx_spa_msk = ((rev_idx_spa_idx >= 0) &
+                       (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s))
     rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
     if rev_idx_spa >= 0:
         blk_x_idx = (rev_idx_spa * x_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0
-                     blk_x_idx < x_b * x_b_s)
+        blk_x_msk = ((blk_x_idx >= 0) &
+                     (blk_x_idx < x_b * x_b_s))
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         blk_o_idx = (pid_blk * o_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx >= 0
-                     blk_o_idx < o_b * o_b_s)
+        blk_o_msk = ((blk_o_idx >= 0) &
+                     (blk_o_idx < o_b * o_b_s))
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
 
 

@@ -165,20 +166,21 @@ def flow_push_kernel(x,
     rev_idx_spa_idx = (spa_bat * s_l_x_b_s +
                        spa_row * s_l_x_r_s +
                        spa_col * s_l_x_c_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx >= 0
+    rev_idx_spa_msk = ((rev_idx_spa_idx >= 0) &
+                       (rev_idx_spa_idx < s_l_x_b * s_l_x_b_s))
     rev_idx_spa = tl.load(r_lut + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
     if rev_idx_spa >= 0:
         blk_x_idx = (pid_blk * x_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0
-                     blk_x_idx < x_b * x_b_s)
+        blk_x_msk = ((blk_x_idx >= 0) &
+                     (blk_x_idx < x_b * x_b_s))
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         blk_o_idx = (rev_idx_spa * o_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx >= 0
-                     blk_o_idx < o_b * o_b_s)
+        blk_o_msk = ((blk_o_idx >= 0) &
+                     (blk_o_idx < o_b * o_b_s))
         tl.atomic_add(o + blk_o_idx, blk_x, mask=blk_o_msk)
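`flow_push_kernel` stores with `tl.atomic_add`, so several source blocks that map to the same destination accumulate rather than overwrite. A dense analogue using `index_add_` (shapes illustrative, not the blksprs API):

```python
import torch

dst = torch.zeros(2, 4)
src = torch.ones(3, 4)
mapping = torch.tensor([0, 1, 0])  # two source blocks land on row 0
dst.index_add_(0, mapping, src)
print(dst)  # row 0 sums to 2.0 everywhere, row 1 stays at 1.0
```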
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/matmul.py

@@ -5,9 +5,9 @@ from torch.library import triton_op, wrap_triton
 from triton import language as tl
 
 from blksprs.ops.transpose import transpose
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
-from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_dtype_float, ensure_contiguous
 

@@ -47,11 +47,11 @@ def matmul(x: BlksprsTensor, sparsity_layout_x: Tensor,
 
     lut = matmul_build_lut(lut, sparsity_layout_x, sparsity_layout_y, sparsity_layout_output)
 
-    return BlksprsTensor(matmul_forward(x, y,
-
-
-
-
+    return BlksprsTensor.wrap(matmul_forward(x, y,
+                                             sparsity_layout_x, lut["sparsity_reverse_lut_x"],
+                                             sparsity_layout_y, lut["sparsity_reverse_lut_y"],
+                                             sparsity_layout_output, lut["sparsity_lut_o"],
+                                             sparsity_block_size, lut["n_sparse_blocks"]))
 
 
 @triton_op("blksprs::matmul_forward", mutates_args={})

@@ -169,12 +169,14 @@ def matmul_kernel(x,
     rev_idx_spa_x_idx = (spa_bat_o * s_l_x_b_s +
                          spa_row_o * s_l_x_r_s +
                          i_seg_spa * s_l_x_c_s)
-    rev_idx_spa_x_msk = (rev_idx_spa_x_idx >= 0
+    rev_idx_spa_x_msk = ((rev_idx_spa_x_idx >= 0) &
+                         (rev_idx_spa_x_idx < s_l_x_b * s_l_x_b_s))
     rev_idx_spa_x = tl.load(r_lut_x + rev_idx_spa_x_idx, mask=rev_idx_spa_x_msk).to(tl.int32)
 
     # Get reverse sparsity indices for y
     rev_idx_spa_y_idx = (spa_bat_o * s_l_y_b_s + i_seg_spa * s_l_y_r_s + spa_col_o * s_l_y_c_s)
-    rev_idx_spa_y_msk = (rev_idx_spa_y_idx >= 0
+    rev_idx_spa_y_msk = ((rev_idx_spa_y_idx >= 0) &
+                         (rev_idx_spa_y_idx < s_l_y_b * s_l_y_b_s))
     rev_idx_spa_y = tl.load(r_lut_y + rev_idx_spa_y_idx, mask=rev_idx_spa_y_msk).to(tl.int32)
 
     # If both blocks are present commence calculation

@@ -183,16 +185,16 @@ def matmul_kernel(x,
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((i_seg_tri_mod * TRITON_BLOCK_SIZE +
                       tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0
-                     blk_x_idx < x_b * x_b_s)
+        blk_x_msk = ((blk_x_idx >= 0) &
+                     (blk_x_idx < x_b * x_b_s))
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         blk_y_idx = ((rev_idx_spa_y * y_b_s) +
                      ((i_seg_tri_mod * TRITON_BLOCK_SIZE +
                       tl.arange(0, TRITON_BLOCK_SIZE)) * y_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * y_c_s)[None, :])
-        blk_y_msk = (blk_y_idx >= 0
-                     blk_y_idx < y_b * y_b_s)
+        blk_y_msk = ((blk_y_idx >= 0) &
+                     (blk_y_idx < y_b * y_b_s))
         blk_y = tl.load(y + blk_y_idx, mask=blk_y_msk)
 
         # Perform matrix multiplication

@@ -205,8 +207,8 @@ def matmul_kernel(x,
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx >= 0
-                 blk_o_idx < o_b * o_b_s)
+    blk_o_msk = ((blk_o_idx >= 0) &
+                 (blk_o_idx < o_b * o_b_s))
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
 
 
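As a reading aid for the kernel hunks above, a dense reference of the computation `matmul_kernel` performs blockwise: an output block accumulates products only over the k-segments where both operand blocks are present in their layouts. This is a plain-PyTorch sketch with illustrative names, not the library's code path:

```python
import torch

def block_sparse_matmul_ref(x, lx, y, ly, bs):
    # x: (B, R, K) dense, y: (B, K, C) dense; lx, ly: boolean block layouts
    B, R, K = x.shape
    C = y.shape[2]
    o = torch.zeros(B, R, C, dtype=x.dtype, device=x.device)
    for b in range(B):
        for i in range(R // bs):
            for j in range(C // bs):
                for k in range(K // bs):
                    # Skip segments where either operand block is absent
                    if lx[b, i, k] and ly[b, k, j]:
                        o[b, i*bs:(i+1)*bs, j*bs:(j+1)*bs] += (
                            x[b, i*bs:(i+1)*bs, k*bs:(k+1)*bs]
                            @ y[b, k*bs:(k+1)*bs, j*bs:(j+1)*bs])
    return o
```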
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/misc/broadcast_ops.py

@@ -5,9 +5,9 @@ from torch._library import triton_op
 from torch._library.triton import wrap_triton
 from triton import language as tl
 
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
-from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_device, \
     validate_sparsity_block_size, ensure_contiguous
 

@@ -43,7 +43,7 @@ def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
 
     validate_contiguous(sparsity_layout_output, sparsity_lut_o)
 
-    return BlksprsTensor(broadcast_add_forward(x, y, sparsity_lut_o, sparsity_block_size, n_sparse_blocks))
+    return BlksprsTensor.wrap(broadcast_add_forward(x, y, sparsity_lut_o, sparsity_block_size, n_sparse_blocks))
 
 
 def broadcast_sub(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,

@@ -121,16 +121,16 @@ def broadcast_add_kernel(x,
     blk_x_idx = (spa_bat_o * x_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + spa_row_o * sparsity_block_size +
                   tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx >= 0
-                 blk_x_idx < x_b * x_b_s)
+    blk_x_msk = ((blk_x_idx >= 0) &
+                 (blk_x_idx < x_b * x_b_s))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Load y block
     blk_y_idx = (spa_bat_o * y_b_s +
                  ((pid_col * TRITON_BLOCK_SIZE + spa_col_o * sparsity_block_size +
                   tl.arange(0, TRITON_BLOCK_SIZE)) * y_c_s)[None, :])
-    blk_y_msk = (blk_y_idx >= 0
-                 blk_y_idx < y_b * y_b_s)
+    blk_y_msk = ((blk_y_idx >= 0) &
+                 (blk_y_idx < y_b * y_b_s))
     blk_y = tl.load(y + blk_y_idx, mask=blk_y_msk)
 
     # Compute sum

@@ -141,6 +141,6 @@ def broadcast_add_kernel(x,
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx >= 0
-                 blk_o_idx < o_b * o_b_s)
+    blk_o_msk = ((blk_o_idx >= 0) &
+                 (blk_o_idx < o_b * o_b_s))
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/misc/row_wise.py

@@ -55,7 +55,7 @@ def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     validate_contiguous(sparsity_layout, sparsity_lut,
                         sparsity_layout_output, sparsity_reverse_lut_output)
 
-    return BlksprsTensor(row_wise_sum_forward(
+    return BlksprsTensor.wrap(row_wise_sum_forward(
         x, sparsity_lut, sparsity_layout_output, sparsity_reverse_lut_output,
         sparsity_block_size, n_sparse_blocks_output, flag_slice_only)), sparsity_layout_output
 

@@ -130,15 +130,16 @@ def row_wise_sum_kernel(x,
     # Load reverse sparsity index for current block
     rev_idx_spa_idx = (spa_bat_x * s_l_o_b_s +
                        spa_row_x * s_l_o_r_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx >= 0
+    rev_idx_spa_msk = ((rev_idx_spa_idx >= 0) &
+                       (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s))
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
     if rev_idx_spa >= 0:
         blk_idx = ((pid_blk * x_b_s) +
                    ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                    ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_msk = (blk_idx >= 0
-                   blk_idx < x_b * x_b_s)
+        blk_msk = ((blk_idx >= 0) &
+                   (blk_idx < x_b * x_b_s))
         blk = tl.load(x + blk_idx, mask=blk_msk)
 
         buf = tl.reshape(tl.sum(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))

@@ -146,8 +147,8 @@ def row_wise_sum_kernel(x,
         o_idx = (rev_idx_spa * o_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  (tl.arange(0, 1))[None, :])
-        o_msk = (o_idx >= 0
-                 o_idx < o_b * o_b_s)
+        o_msk = ((o_idx >= 0) &
+                 (o_idx < o_b * o_b_s))
         tl.atomic_add(o + o_idx, buf, o_msk)
 
 

@@ -174,8 +175,6 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
         of the input and the sparsity layout of the output tensor.
 
     """
-    # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376, should be fixed with the upcoming 3.4.0 release
-    x = torch.where(x == -0.0, torch.tensor(0.0), x)
     x = ensure_contiguous(x)
 
     validate_dimensions(x)

@@ -197,7 +196,7 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     validate_contiguous(sparsity_layout, sparsity_lut,
                         sparsity_layout_output, sparsity_reverse_lut_output)
 
-    return BlksprsTensor(
+    return BlksprsTensor.wrap(
         row_wise_max_forward(x, sparsity_lut, sparsity_layout_output, sparsity_reverse_lut_output, sparsity_block_size,
                              n_sparse_blocks_output, flag_slice_only)), sparsity_layout_output
 

@@ -274,15 +273,16 @@ def row_wise_max_kernel(x,
     # Load reverse sparsity index for current block
     rev_idx_spa_idx = (spa_bat_x * s_l_o_b_s +
                        spa_row_x * s_l_o_r_s)
-    rev_idx_spa_msk = (rev_idx_spa_idx >= 0
+    rev_idx_spa_msk = ((rev_idx_spa_idx >= 0) &
+                       (rev_idx_spa_idx < s_l_o_b * s_l_o_b_s))
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
     if rev_idx_spa >= 0:
         blk_idx = ((pid_blk * x_b_s) +
                    ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                    ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_msk = (blk_idx >= 0
-                   blk_idx < x_b * x_b_s)
+        blk_msk = ((blk_idx >= 0) &
+                   (blk_idx < x_b * x_b_s))
         blk = tl.load(x + blk_idx, mask=blk_msk)
 
         buf = tl.reshape(tl.max(blk, axis=-1), (TRITON_BLOCK_SIZE, 1))

@@ -290,8 +290,8 @@ def row_wise_max_kernel(x,
         o_idx = (rev_idx_spa * o_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  (tl.arange(0, 1))[None, :])
-        o_msk = (o_idx >= 0
-                 o_idx < o_b * o_b_s)
+        o_msk = ((o_idx >= 0) &
+                 (o_idx < o_b * o_b_s))
         tl.atomic_max(o + o_idx, buf, o_msk)
 
 

@@ -329,8 +329,8 @@ def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
 
     validate_contiguous(sparsity_layout_x, sparsity_lut_x, sparsity_reverse_lut_rwm)
 
-    return BlksprsTensor(row_wise_add_forward(x, sparsity_lut_x, sparsity_layout_rwm,
-
+    return BlksprsTensor.wrap(row_wise_add_forward(x, sparsity_lut_x, sparsity_layout_rwm,
+                                                   sparsity_reverse_lut_rwm, y, sparsity_block_size))
 
 
 def row_wise_sub(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,

@@ -412,7 +412,8 @@ def row_wise_add_kernel(x,
     # Get reverse sparsity indices for s
     rev_idx_spa_s_idx = (spa_bat_x * s_l_y_b_s +
                          spa_row_x * s_l_y_r_s)
-    rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0
+    rev_idx_spa_s_msk = ((rev_idx_spa_s_idx >= 0) &
+                         (rev_idx_spa_s_idx < s_l_y_b * s_l_y_b_s))
     rev_idx_spa_s = tl.load(r_lut_y + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
 
     if rev_idx_spa_s == -1:

@@ -423,16 +424,16 @@ def row_wise_add_kernel(x,
     blk_x_idx = ((pid_blk * x_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-    blk_x_msk = (blk_x_idx >= 0
-                 blk_x_idx < x_b * x_b_s)
+    blk_x_msk = ((blk_x_idx >= 0) &
+                 (blk_x_idx < x_b * x_b_s))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Load sum block
     blk_s_idx = (rev_idx_spa_s * y_b_s +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * y_r_s)[:, None] +
                  (tl.arange(0, 1) * y_c_s)[None, :])
-    blk_s_msk = (blk_s_idx >= 0
-                 blk_s_idx < y_b * y_b_s)
+    blk_s_msk = ((blk_s_idx >= 0) &
+                 (blk_s_idx < y_b * y_b_s))
     blk_s = tl.load(y + blk_s_idx, mask=blk_s_msk)
 
     # Compute exp

@@ -442,6 +443,6 @@ def row_wise_add_kernel(x,
     blk_o_idx = ((pid_blk * o_b_s) +
                  ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-    blk_o_msk = (blk_o_idx >= 0
-                 blk_o_idx < o_b * o_b_s)
+    blk_o_msk = ((blk_o_idx >= 0) &
+                 (blk_o_idx < o_b * o_b_s))
     tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/partitioning.py

@@ -41,7 +41,7 @@ def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
 
     lut = split_build_lut(lut, sparsity_layout, partitions)
 
-    return BlksprsTensor(split_forward(
+    return BlksprsTensor.wrap(split_forward(
         x, lut["sparsity_layout_output"], lut["sparsity_lut"], lut["sparsity_reverse_lut"],
         partitions, adjusted_dim, sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_output"]
 

@@ -146,7 +146,7 @@ def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
 
     lut = merge_build_lut(lut, sparsity_layout, partitions)
 
-    return BlksprsTensor(merge_forward(
+    return BlksprsTensor.wrap(merge_forward(
         x, lut["sparsity_layout_output"], lut["sparsity_lut"], lut["sparsity_reverse_lut"],
         partitions, adjusted_dim, sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_output"]
 
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/repeat.py

@@ -46,7 +46,7 @@ def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int,
 
     lut = repeat_build_lut(lut, sparsity_layout_x, repeats, sparsity_layout_output)
 
-    return BlksprsTensor(repeat_forward(
+    return BlksprsTensor.wrap(repeat_forward(
         x, sparsity_layout_x, lut["sparsity_layout_o"], lut["sparsity_lut"],
         lut["sparsity_reverse_lut"], sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_o"]
 

@@ -87,7 +87,7 @@ def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,
 
     lut = repeat_interleave_build_lut(lut, sparsity_layout_x, repeats, sparsity_layout_output)
 
-    return BlksprsTensor(repeat_forward(
+    return BlksprsTensor.wrap(repeat_forward(
         x, sparsity_layout_x, lut["sparsity_layout_o"], lut["sparsity_lut"],
         lut["sparsity_reverse_lut"], sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_o"]
 
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/softmax.py

@@ -1,5 +1,3 @@
-import pdb
-
 import torch
 import triton
 from torch import Tensor

@@ -8,9 +6,9 @@ from torch._library.triton import wrap_triton
 from triton import language as tl
 
 from blksprs.ops.misc.row_wise import row_wise_sum, row_wise_max, row_wise_sub
+from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride, ceil_pow2
-from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
     validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32, ensure_contiguous
 

@@ -55,10 +53,10 @@ def softmax_regular(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_si
 
     lut = softmax_build_lut(lut, sparsity_layout)
 
-    return BlksprsTensor(softmax_forward(x, sparsity_layout,
-
-
-
+    return BlksprsTensor.wrap(softmax_forward(x, sparsity_layout,
+                                              lut["sparsity_lut"],
+                                              lut["sparsity_reverse_lut_rws"],
+                                              sparsity_block_size))
 
 
 @triton_op("blksprs::softmax_forward", mutates_args={})

@@ -186,7 +184,8 @@ def softmax_kernel(x,
     # Get reverse sparsity indices for s
     rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
                          spa_row * s_l_s_r_s)
-    rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0
+    rev_idx_spa_s_msk = ((rev_idx_spa_s_idx >= 0) &
+                         (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s))
     rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
 
     if rev_idx_spa_s >= 0:

@@ -194,16 +193,16 @@ def softmax_kernel(x,
         blk_x_idx = ((pid_blk * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0
-                     blk_x_idx < x_b * x_b_s)
+        blk_x_msk = ((blk_x_idx >= 0) &
+                     (blk_x_idx < x_b * x_b_s))
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         # Load sum block
         blk_s_idx = (rev_idx_spa_s * s_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (blk_s_idx >= 0
-                     blk_s_idx < s_b * s_b_s)
+        blk_s_msk = ((blk_s_idx >= 0) &
+                     (blk_s_idx < s_b * s_b_s))
         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
 
         # Compute softmax

@@ -249,29 +248,30 @@ def softmax_kernel_grad(g,
 
     rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
                          spa_row * s_l_s_r_s)
-    rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0
+    rev_idx_spa_s_msk = ((rev_idx_spa_s_idx >= 0) &
+                         (rev_idx_spa_s_idx < s_l_s_b * s_l_s_b_s))
     rev_idx_spa_s = tl.load(r_lut_s + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
 
     if rev_idx_spa_s >= 0:
         blk_s_idx = (rev_idx_spa_s * s_b_s +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * s_r_s)[:, None] +
                      (tl.arange(0, 1) * s_c_s)[None, :])
-        blk_s_msk = (blk_s_idx >= 0
-                     blk_s_idx < s_b * s_b_s)
+        blk_s_msk = ((blk_s_idx >= 0) &
+                     (blk_s_idx < s_b * s_b_s))
         blk_s = tl.load(s + blk_s_idx, mask=blk_s_msk)
 
         blk_g_idx = ((pid_blk * g_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * g_c_s)[None, :])
-        blk_g_msk = (blk_g_idx >= 0
-                     blk_g_idx < g_b * g_b_s)
+        blk_g_msk = ((blk_g_idx >= 0) &
+                     (blk_g_idx < g_b * g_b_s))
         blk_g = tl.load(g + blk_g_idx, mask=blk_g_msk)
 
         blk_x_idx = ((pid_blk * x_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * x_c_s)[None, :])
-        blk_x_msk = (blk_x_idx >= 0
-                     blk_x_idx < x_b * x_b_s)
+        blk_x_msk = ((blk_x_idx >= 0) &
+                     (blk_x_idx < x_b * x_b_s))
         blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
         buf = blk_x * (blk_g - blk_s)

@@ -279,8 +279,8 @@ def softmax_kernel_grad(g,
         blk_o_idx = ((pid_blk * o_b_s) +
                      ((pid_row * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_r_s)[:, None] +
                      ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
-        blk_o_msk = (blk_o_idx >= 0
-                     blk_o_idx < o_b * o_b_s)
+        blk_o_msk = ((blk_o_idx >= 0) &
+                     (blk_o_idx < o_b * o_b_s))
         tl.store(o + blk_o_idx, buf, mask=blk_o_msk)
 
 

@@ -346,10 +346,10 @@ def softmax_fused(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size
 
     lut = softmax_fused_build_lut(lut, sparsity_layout)
 
-    return BlksprsTensor(softmax_fused_forward(x, sparsity_layout,
-
-
-
+    return BlksprsTensor.wrap(softmax_fused_forward(x, sparsity_layout,
+                                                    lut["sparsity_reverse_lut_sorted"],
+                                                    lut["max_blocks_line"],
+                                                    sparsity_block_size))
 
 
 @triton_op("blksprs::softmax_fused_forward", mutates_args={})

@@ -449,7 +449,8 @@ def softmax_fused_kernel(x,
     blk_rev_idx = (pid_bat * s_l_b_s +
                    pid_row * s_l_r_s +
                    (tl.arange(0, mbs) * s_l_c_s))
-    blk_rev_msk = ((blk_rev_idx >= 0
+    blk_rev_msk = (((blk_rev_idx >= 0) &
+                    (blk_rev_idx < s_l_b * s_l_b_s)) &
                    (tl.arange(0, mbs) < s_l_c))
     blk_rev = tl.load(r_lut_s + blk_rev_idx, mask=blk_rev_msk, other=-1).to(tl.int32)
 

@@ -464,8 +465,9 @@ def softmax_fused_kernel(x,
     blk_x_idx = (blk_rev_ext * x_b_s +
                  pid_lin * x_r_s +
                  (tl.arange(0, mbs * sparsity_block_size) % sparsity_block_size) * x_c_s)
-    blk_x_mask = ((blk_x_idx >= 0
-
+    blk_x_mask = (((blk_x_idx >= 0) &
+                   (blk_x_idx < x_b * x_b_s)) &
+                  (blk_rev_ext != -1))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_mask, other=float("-inf"))
 
     # Compute softmax

@@ -502,7 +504,8 @@ def softmax_fused_kernel_grad(g,
     blk_rev_idx = (pid_bat * s_l_b_s +
                    pid_row * s_l_r_s +
                    (tl.arange(0, mbs) * s_l_c_s))
-    blk_rev_msk = ((blk_rev_idx >= 0
+    blk_rev_msk = (((blk_rev_idx >= 0) &
+                    (blk_rev_idx < s_l_b * s_l_b_s)) &
                    (tl.arange(0, mbs) < s_l_c))
     blk_rev = tl.load(r_lut_s + blk_rev_idx, mask=blk_rev_msk, other=-1).to(tl.int32)
 

@@ -517,16 +520,18 @@ def softmax_fused_kernel_grad(g,
     blk_g_idx = (blk_rev_ext * g_b_s +
                  pid_lin * g_r_s +
                  (tl.arange(0, mbs * sparsity_block_size) % sparsity_block_size) * g_c_s)
-    blk_g_mask = ((blk_g_idx >= 0
-
+    blk_g_mask = (((blk_g_idx >= 0) &
+                   (blk_g_idx < g_b * g_b_s)) &
+                  (blk_rev_ext != -1))
     blk_g = tl.load(g + blk_g_idx, mask=blk_g_mask)
 
     # Load line of x
     blk_x_idx = (blk_rev_ext * x_b_s +
                  pid_lin * x_r_s +
                  (tl.arange(0, mbs * sparsity_block_size) % sparsity_block_size) * x_c_s)
-    blk_x_mask = ((blk_x_idx >= 0
-
+    blk_x_mask = (((blk_x_idx >= 0) &
+                   (blk_x_idx < x_b * x_b_s)) &
+                  (blk_rev_ext != -1))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_mask)
 
     # Compute gradients
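The fused softmax loads masked lanes with `other=float("-inf")`: since exp(-inf) is 0, padding lanes contribute nothing to the row sum. The same effect in plain PyTorch:

```python
import torch

row = torch.tensor([1.0, 2.0, float("-inf"), float("-inf")])
print(torch.softmax(row, dim=-1))  # padding positions come out exactly 0
```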
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/ops/transpose.py

@@ -37,9 +37,9 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor,
 
     lut = transpose_build_lut(lut, sparsity_layout)
 
-    return BlksprsTensor(transpose_forward(x, lut["sparsity_layout_t"],
-
-
+    return BlksprsTensor.wrap(transpose_forward(x, lut["sparsity_layout_t"],
+                                                lut["sparsity_lut"], lut["sparsity_reverse_lut"],
+                                                sparsity_block_size, lut["n_sparse_blocks"])), lut["sparsity_layout_t"]
 
 
 @triton_op("blksprs::transpose_forward", mutates_args={})
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/blksprs_tensor.py

@@ -1,3 +1,5 @@
+from typing import Union
+
 import torch
 from torch import Tensor
 

@@ -7,4 +9,11 @@ class BlksprsTensor(Tensor):
     """
 
     def __repr__(self):
-        return f"BlksprsTensor({torch.Tensor(self).__repr__()})"
+        return f"BlksprsTensor({torch.Tensor(self).__repr__()})"
+
+    @staticmethod
+    def wrap(tensor: Tensor) -> Union[Tensor, "BlksprsTensor"]:
+        if torch._dynamo.is_compiling():
+            return tensor
+        else:
+            return BlksprsTensor(tensor)
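The new `wrap` is the mechanism behind the many `BlksprsTensor(...)` to `BlksprsTensor.wrap(...)` call-site changes in this diff: constructing a `Tensor` subclass inside a dynamo-traced region is problematic, so `wrap` hands back the plain tensor while compiling and only subclasses in eager mode. A small demonstration, where `f` and the compiled call are illustrative:

```python
import torch
from blksprs.utils.blksprs_tensor import BlksprsTensor

t = torch.randn(4, 4)
print(type(BlksprsTensor.wrap(t)).__name__)  # "BlksprsTensor" in eager mode

def f(x):
    # While torch.compile traces this, wrap returns the plain Tensor.
    return BlksprsTensor.wrap(x * 2)

out = torch.compile(f)(t)
assert torch.equal(out, t * 2)
```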
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/processing.py

@@ -26,7 +26,8 @@ def apply_torch_linear(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block
 
     # Apply weights
     sparsity_layout_xw = build_sparsity_layout_matmul_fast(sparsity_layout, sparsity_layout_w_t)
-    xw = matmul(x, sparsity_layout, BlksprsTensor(w_t_bs.to(x.dtype)), sparsity_layout_w_t, sparsity_layout_xw,
+    xw = matmul(x, sparsity_layout, BlksprsTensor.wrap(w_t_bs.to(x.dtype)), sparsity_layout_w_t, sparsity_layout_xw,
+                sparsity_block_size)
     interim = xw
 
     # Apply bias
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs/utils/tools.py

@@ -1,9 +1,5 @@
-import torch
 from torch import Tensor, Size
 
-# Capture scalar outputs for JIT compilation
-torch._dynamo.config.capture_scalar_outputs = True
-
 
 def do_shape_blocksparse(x: Tensor) -> tuple[Tensor, Size]:
     if x.dim() == 3:

@@ -27,7 +23,8 @@ def stride(x: Tensor):
     else:
         raise NotImplementedError
 
+
 def ceil_pow2(x: int) -> int:
     if x <= 0:
         raise ValueError("Input must be a positive integer.")
-    return 1 << (x - 1).bit_length()
+    return 1 << (x - 1).bit_length()
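`ceil_pow2` is only reflowed here, but the bit trick deserves a worked example: for x at least 1, `(x - 1).bit_length()` counts the bits needed to represent x - 1, so the shift yields the smallest power of two greater than or equal to x.

```python
def ceil_pow2(x: int) -> int:
    if x <= 0:
        raise ValueError("Input must be a positive integer.")
    # (x - 1).bit_length() bits cover every value below x, so shifting 1
    # by that amount gives the next power of two at or above x.
    return 1 << (x - 1).bit_length()

assert [ceil_pow2(n) for n in (1, 2, 3, 5, 8, 9)] == [1, 2, 4, 8, 8, 16]
```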
{blksprs-2.1.7 → blksprs-2.1.9}/blksprs.egg-info/PKG-INFO

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.1.7
+Version: 2.1.9
 Summary: A lightweight library for operations on block-sparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
 Project-URL: Bugtracker, https://github.com/FelixSchoen/blksprs/issues
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
-Requires-Dist: torch
+Requires-Dist: torch>=2.8.0
 Requires-Dist: numpy
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"

@@ -102,7 +102,7 @@ We will continue to maintain the library and fix any issues that arise.
 Should you find any bugs please open an [issue](https://github.com/FelixSchoen/blksprs/issues).
 We also encourage [pull requests](https://github.com/FelixSchoen/blksprs/pulls).
 
-It might be that this changes with future projects, but as of
+It might be that this changes with future projects, but as of August 2025, we are content with the current state of the
 library.
 
 ## Known Limitations and Issues
{blksprs-2.1.7 → blksprs-2.1.9}/pyproject.toml

@@ -1,13 +1,13 @@
 [project]
 name = "blksprs"
-version = "2.1.7"
+version = "2.1.9"
 authors = [{ name = "Felix Schön", email = "schoen@kr.tuwien.ac.at" }]
 description = "A lightweight library for operations on block-sparse matrices in PyTorch."
 readme = "README.md"
 requires-python = ">=3.11"
 license = { file = "LICENSE.md" }
 dependencies = [
-    "torch",
+    "torch >= 2.8.0",
     "numpy"
 ]
 
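With the dependency bumped to `torch >= 2.8.0`, an environment that previously resolved the unpinned `torch` may now be too old. A quick check one might run before upgrading, using the `packaging` helper; the comparison is simplified for illustration:

```python
import torch
from packaging.version import Version

installed = Version(torch.__version__.split("+")[0])
assert installed >= Version("2.8.0"), "blksprs 2.1.9 requires torch>=2.8.0"
```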