blksprs 2.0rc6__tar.gz → 2.0rc7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {blksprs-2.0rc6 → blksprs-2.0rc7}/PKG-INFO +7 -3
  2. {blksprs-2.0rc6 → blksprs-2.0rc7}/README.md +6 -2
  3. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/distribution.py +1 -1
  4. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/misc/broadcast_ops.py +1 -0
  5. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/misc/row_wise.py +7 -3
  6. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/utils/tools.py +0 -9
  7. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs.egg-info/PKG-INFO +7 -3
  8. {blksprs-2.0rc6 → blksprs-2.0rc7}/pyproject.toml +1 -1
  9. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/__init__.py +0 -0
  10. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/layouting/distribution_layout.py +0 -0
  11. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/layouting/sparsity_layout.py +0 -0
  12. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/conversion.py +0 -0
  13. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/flow.py +0 -0
  14. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/matmul.py +0 -0
  15. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/partitioning.py +0 -0
  16. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/repeat.py +0 -0
  17. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/softmax.py +0 -0
  18. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/transpose.py +0 -0
  19. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/utils/autotuning.py +0 -0
  20. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/utils/benchmarking.py +0 -0
  21. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/utils/blksprs_tensor.py +0 -0
  22. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/utils/processing.py +0 -0
  23. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/utils/validation.py +0 -0
  24. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs.egg-info/SOURCES.txt +0 -0
  25. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs.egg-info/dependency_links.txt +0 -0
  26. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs.egg-info/requires.txt +0 -0
  27. {blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs.egg-info/top_level.txt +0 -0
  28. {blksprs-2.0rc6 → blksprs-2.0rc7}/setup.cfg +0 -0
{blksprs-2.0rc6 → blksprs-2.0rc7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.0rc6
+Version: 2.0rc7
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
@@ -108,12 +108,16 @@ library.
 
 ## Known Limitations and Issues
 
+- Triton has a bug with `tl.atomic_max()`, which is used for the row-wise max operation.
+  In order to work around this bug, a manual conversion of some values is needed, (slightly) negatively impacting
+  performance.
+  Watch the [issue](https://github.com/triton-lang/triton/issues/6376) on Triton's issue tracker for more information.
 - PyTorch's `wrap_triton()` currently does not support config pruning. It thus cannot be used for some of the kernels,
   which could impact graph compilation.
 - There seem to be some issues with autocasting, forcing some operations to manually cast.
 - There will be some slight numerical differences between vanilla and blksprs operations.
-  These instabilities are due to Triton and thus cannot be fixed by this library alone.
-  However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
+  These instabilities are due to Triton and thus cannot be fixed by this library alone.
+  However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
 
 ## Usage
 
{blksprs-2.0rc6 → blksprs-2.0rc7}/README.md

@@ -89,12 +89,16 @@ library.
 
 ## Known Limitations and Issues
 
+- Triton has a bug with `tl.atomic_max()`, which is used for the row-wise max operation.
+  In order to work around this bug, a manual conversion of some values is needed, (slightly) negatively impacting
+  performance.
+  Watch the [issue](https://github.com/triton-lang/triton/issues/6376) on Triton's issue tracker for more information.
 - PyTorch's `wrap_triton()` currently does not support config pruning. It thus cannot be used for some of the kernels,
   which could impact graph compilation.
 - There seem to be some issues with autocasting, forcing some operations to manually cast.
 - There will be some slight numerical differences between vanilla and blksprs operations.
-  These instabilities are due to Triton and thus cannot be fixed by this library alone.
-  However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
+  These instabilities are due to Triton and thus cannot be fixed by this library alone.
+  However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
 
 ## Usage
 
{blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/distribution.py

@@ -240,7 +240,7 @@ def scatter(src: BlksprsTensor, sparsity_layout_src: Tensor,
                 reduce_op="none", lut=lut)
 
 
-@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
 def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
                    dim: int,
                    idx: BlksprsTensor,
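For context, `torch.amp.custom_fwd(cast_inputs=...)` casts floating-point CUDA tensor arguments to the given dtype and disables autocast inside the decorated forward whenever an autocast region is active; the change above switches `scatter_reduce` from a float16 cast to a float32 cast. Below is a minimal sketch of that behaviour using a toy `torch.autograd.Function`, not the blksprs implementation:

```python
import torch

class ScaledAdd(torch.autograd.Function):
    @staticmethod
    @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
    def forward(ctx, x, y):
        # Under torch.autocast("cuda"), x and y arrive here already cast to
        # float32, and autocast is disabled for the body of forward.
        return x + 2.0 * y

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")
    def backward(ctx, grad_output):
        # Gradients of x + 2*y with respect to x and y.
        return grad_output, 2.0 * grad_output

a = torch.randn(4, device="cuda", dtype=torch.float16, requires_grad=True)
b = torch.randn(4, device="cuda", dtype=torch.float16, requires_grad=True)
with torch.autocast("cuda", dtype=torch.float16):
    out = ScaledAdd.apply(a, b)
print(out.dtype)  # torch.float32, despite float16 inputs inside an autocast region
```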
{blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/misc/broadcast_ops.py

@@ -12,6 +12,7 @@ from blksprs.utils.validation import validate_contiguous, validate_device, \
     validate_sparsity_block_size
 
 
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
 def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
                   sparsity_block_size: int) -> BlksprsTensor:
     """Performs a broadcast and subsequent addition of two dense tensors x and y. Returns a block-sparse tensor in
{blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/ops/misc/row_wise.py

@@ -4,9 +4,9 @@ from torch import Tensor
 from torch._library.triton import wrap_triton, triton_op
 from triton import language as tl
 
-from blksprs.utils.blksprs_tensor import BlksprsTensor
-from blksprs.utils.tools import stride, get_autocast_min_val
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
+from blksprs.utils.blksprs_tensor import BlksprsTensor
+from blksprs.utils.tools import stride
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, validate_sparsity, \
     validate_sparsity_block_size
 
@@ -95,6 +95,7 @@ def row_wise_sum_forward(x: Tensor, sparsity_lut: Tensor,
     return output
 
 
+# noinspection PyUnusedLocal
 @triton.autotune(
     configs=get_autotune_configs(),
     key=["sparsity_block_size"],
@@ -175,6 +176,8 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
         of the input and the sparsity layout of the output tensor.
 
     """
+    # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376
+    x = torch.where(x == -0.0, torch.tensor(0.0), x)
     x = x.contiguous()
 
     validate_dimensions(x)
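The two added lines are the workaround referenced by the new "Known Limitations" bullet: before the row-wise max is computed, negative zeros in the input are rewritten as positive zeros, presumably to sidestep the signed-zero handling of `tl.atomic_max()` tracked in the linked Triton issue. A standalone sketch (not blksprs code) of what the `torch.where` call does:

```python
import torch

x = torch.tensor([[-0.0, 1.5, -2.0],
                  [ 0.0, -0.0, 3.0]])

# x == -0.0 is True for both +0.0 and -0.0 (IEEE 754 treats them as equal),
# so every zero is replaced by +0.0 and all non-zero values pass through.
x_fixed = torch.where(x == -0.0, torch.tensor(0.0), x)

print(torch.signbit(x))        # sign bit set for -0.0 and -2.0
print(torch.signbit(x_fixed))  # sign bit now set only for -2.0
```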
@@ -209,7 +212,7 @@ def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
     output = torch.full(size=(n_sparse_blocks_output,
                               sparsity_block_size,
                               1 if flag_slice_only else sparsity_block_size),
-                        fill_value=get_autocast_min_val(),
+                        fill_value=torch.finfo(x.dtype).min,
                         device=x.device)
 
     x_b, x_r, x_c = x.size()
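With the removed `get_autocast_min_val()` helper (see the `tools.py` hunk below), the pre-fill value depended on the active autocast dtype; it now comes from the input's own dtype. A small illustrative sketch, not taken from blksprs, of why `torch.finfo(dtype).min` is a safe sentinel for a max reduction:

```python
import torch

for dtype in (torch.float16, torch.bfloat16, torch.float32):
    sentinel = torch.finfo(dtype).min            # most negative finite value
    x = torch.randn(4, 8).to(dtype)
    acc = torch.full((4, 1), sentinel, dtype=dtype)
    # Every finite entry of x is >= the sentinel, so the pre-fill never
    # survives the row-wise max.
    out = torch.maximum(acc, x.max(dim=1, keepdim=True).values)
    assert torch.equal(out, x.max(dim=1, keepdim=True).values)
```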
@@ -238,6 +241,7 @@ def row_wise_max_forward(x: Tensor, sparsity_lut: Tensor,
     return output
 
 
+# noinspection PyUnusedLocal
 @triton.autotune(
     configs=get_autotune_configs(),
     key=["sparsity_block_size"],
{blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs/utils/tools.py

@@ -26,12 +26,3 @@ def stride(x: Tensor):
         return x.size(1) * x.size(2), x.size(2), 1
     else:
         raise NotImplementedError
-
-
-def get_autocast_min_val():
-    if torch.is_autocast_enabled():
-        dtype = torch.get_autocast_dtype("cuda")
-    else:
-        dtype = torch.float
-
-    return torch.finfo(dtype).min
{blksprs-2.0rc6 → blksprs-2.0rc7}/blksprs.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.0rc6
+Version: 2.0rc7
 Summary: A lightweight library for operations on blocksparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
@@ -108,12 +108,16 @@ library.
 
 ## Known Limitations and Issues
 
+- Triton has a bug with `tl.atomic_max()`, which is used for the row-wise max operation.
+  In order to work around this bug, a manual conversion of some values is needed, (slightly) negatively impacting
+  performance.
+  Watch the [issue](https://github.com/triton-lang/triton/issues/6376) on Triton's issue tracker for more information.
 - PyTorch's `wrap_triton()` currently does not support config pruning. It thus cannot be used for some of the kernels,
   which could impact graph compilation.
 - There seem to be some issues with autocasting, forcing some operations to manually cast.
 - There will be some slight numerical differences between vanilla and blksprs operations.
-  These instabilities are due to Triton and thus cannot be fixed by this library alone.
-  However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
+  These instabilities are due to Triton and thus cannot be fixed by this library alone.
+  However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
 
 ## Usage
 
{blksprs-2.0rc6 → blksprs-2.0rc7}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "blksprs"
-version = "2.0-rc.6"
+version = "2.0-rc.7"
 authors = [{ name = "Felix Schön", email = "schoen@kr.tuwien.ac.at" }]
 description = "A lightweight library for operations on blocksparse matrices in PyTorch."
 readme = "README.md"
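Note that `pyproject.toml` spells the version `2.0-rc.7` while the generated metadata above reports `2.0rc7`; both denote the same release after PEP 440 normalization, which can be checked with the `packaging` library:

```python
from packaging.version import Version

# PEP 440 normalizes the "-rc." separator away, so the two spellings match.
assert Version("2.0-rc.7") == Version("2.0rc7")
print(Version("2.0-rc.7"))  # -> 2.0rc7, the normalized form used in PKG-INFO
```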