blksprs-2.1.4-py3-none-any.whl → blksprs-2.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
blksprs/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from blksprs.utils.blksprs_tensor import BlksprsTensor

-__version__ = "2.1.4"
+__version__ = "2.1.6"


 class ops:
@@ -27,9 +27,9 @@ class utils:
     from blksprs.utils.processing import apply_torch_linear, apply_torch_normalisation, apply_torch_dropout, \
         apply_function_applicable_row_wise
     from blksprs.utils.tools import do_shape_blocksparse, undo_shape_blocksparse
+    from blksprs.utils.validation import disable_contiguous, disable_validation

 class validation:
-    from blksprs.utils.validation import disable_validation
     from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_dtype_float, \
         validate_dtype_int, validate_device, validate_sparsity, validate_sparsity_dense, \
         validate_sparsity_block_size
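The hunk above moves `disable_validation` out of the `validation` namespace and re-exports it, together with the new `disable_contiguous` toggle, under `utils`. A minimal usage sketch, assuming the class-level imports behave as namespaces as elsewhere in blksprs/__init__.py:

    import blksprs

    blksprs.utils.disable_validation()   # skip input validation checks globally
    blksprs.utils.disable_contiguous()   # skip .contiguous() enforcement (new in this release)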
blksprs/ops/conversion.py CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size, validate_sparsity_dense
+    validate_sparsity, validate_sparsity_block_size, validate_sparsity_dense, ensure_contiguous


 def to_blksprs(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int) -> BlksprsTensor:
@@ -35,7 +35,7 @@ def to_sparse(x: Tensor, sparsity_layout: Tensor,
         BlksprsTensor: The block-sparse tensor converted to compressed form.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
@@ -106,17 +106,13 @@ def to_sparse_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get sparsity index of current output block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)

-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
-
-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Load block from dense tensor
     blk_d_idx = (spa_bat * x_b_s +
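Every kernel in this release repeats the pattern above: the three masked scalar LUT loads are replaced by one masked vector load over a power-of-two lane range (tl.arange requires a power-of-two extent, so 4 lanes are loaded and lane 3 is masked off), followed by one-hot masked sums that collapse the vector back into the scalar batch/row/column indices, presumably to issue one load instruction instead of three. A minimal, self-contained sketch of just this pattern (hypothetical `lut_read_kernel` and test harness, assuming triton is installed and a CUDA device is available):

    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def lut_read_kernel(s_lut, s_lut_r_s, s_lut_c_s, out):
        pid_blk = tl.program_id(axis=0)

        # One masked vector load over 4 lanes; lane 3 is masked off,
        # so nothing is read past the 3-entry LUT row.
        spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
        spa_val_msk = (tl.arange(0, 4) < 3)
        spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)

        # One-hot masked sums collapse the vector back into three scalars.
        spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
        spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
        spa_col = tl.sum(spa_val * (tl.arange(0, 4) == 2))

        tl.store(out + pid_blk * 3 + 0, spa_bat)
        tl.store(out + pid_blk * 3 + 1, spa_row)
        tl.store(out + pid_blk * 3 + 2, spa_col)


    # Each LUT row stores (batch, row, col) for one sparsity block.
    s_lut = torch.tensor([[0, 2, 1], [1, 0, 3]], dtype=torch.int32, device="cuda")
    out = torch.empty_like(s_lut)
    lut_read_kernel[(s_lut.shape[0],)](s_lut, s_lut.stride(0), s_lut.stride(1), out)
    assert torch.equal(out, s_lut)  # batch/row/col recovered per block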
@@ -191,7 +187,7 @@ def to_dense(x: BlksprsTensor, sparsity_layout: Tensor,
         Tensor: The block-sparse tensor converted to regular form.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x, sparsity_layout)
@@ -339,7 +335,7 @@ def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_
         Tensor: The sparsity layout of the resulting output tensor.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x, sparsity_layout_from)
@@ -445,17 +441,13 @@ def adapt_layout_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
-
-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)

-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Get equivalent sparsity block in from layout
     spa_bat_x = spa_bat_o
blksprs/ops/distribution.py CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_dtype_int, validate_sparsity_block_size
+    validate_sparsity, validate_dtype_int, validate_sparsity_block_size, ensure_contiguous


 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -32,8 +32,7 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
         BlksprsTensor: The result of the gather operation as a block-sparse tensor in compressed form.

     """
-    src = src.contiguous()
-    idx = idx.contiguous()
+    src, idx = ensure_contiguous(src, idx)

     validate_dimensions(src, idx)
     validate_contiguous(src, idx)
@@ -125,17 +124,13 @@ def gather_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)

-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
-
-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Load index values
     blk_i_idx = ((pid_blk * i_b_s) +
@@ -265,8 +260,7 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
         BlksprsTensor: The result of the scatter operation as a block-sparse tensor in compressed form.

     """
-    src = src.contiguous()
-    idx = idx.contiguous()
+    src, idx = ensure_contiguous(src, idx)

     validate_dimensions(src, idx)
     validate_contiguous(src, idx)
@@ -374,17 +368,13 @@ def scatter_reduce_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_x_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
-    spa_bat_x_msk = (spa_bat_x_idx >= 0 and spa_bat_x_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_bat_x = tl.load(s_lut_x + spa_bat_x_idx, mask=spa_bat_x_msk)
-
-    spa_row_x_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
-    spa_row_x_msk = (spa_row_x_idx >= 0 and spa_row_x_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_row_x = tl.load(s_lut_x + spa_row_x_idx, mask=spa_row_x_msk)
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)

-    spa_col_x_idx = (pid_blk * s_lut_x_r_s + 2 * s_lut_x_c_s)
-    spa_col_x_msk = (spa_col_x_idx >= 0 and spa_col_x_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_col_x = tl.load(s_lut_x + spa_col_x_idx, mask=spa_col_x_msk)
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Load x values
     blk_x_idx = ((pid_blk * x_b_s) +
blksprs/ops/flow.py CHANGED
@@ -66,17 +66,13 @@ def flow_pull_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get sparsity index of current output block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)

-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
-
-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Load reverse sparsity index
     rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
@@ -157,17 +153,13 @@ def flow_push_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get sparsity index of current input block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
-
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)

-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Get reverse sparsity index
     rev_idx_spa_idx = (spa_bat * s_l_x_b_s +
blksprs/ops/matmul.py CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size, validate_dtype_float
+    validate_sparsity, validate_sparsity_block_size, validate_dtype_float, ensure_contiguous


 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -34,8 +34,7 @@ def matmul(x: BlksprsTensor, sparsity_layout_x: Tensor,
         BlksprsTensor: The result of the matrix multiplication as a block-sparse tensor in compressed form.

     """
-    x = x.contiguous()
-    y = y.contiguous()
+    x, y = ensure_contiguous(x, y)

     validate_dimensions(x, y)
     validate_contiguous(x, y)
@@ -145,17 +144,13 @@ def matmul_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)

-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
-
-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Setup buffer
     buf = tl.zeros(shape=(TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), dtype=tl.float32)
blksprs/ops/misc/broadcast_ops.py CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_device, \
-    validate_sparsity_block_size
+    validate_sparsity_block_size, ensure_contiguous


 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -29,8 +29,7 @@ def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
         output tensor corresponds to x(i) + y(j).

     """
-    x = x.contiguous()
-    y = y.contiguous()
+    x, y = ensure_contiguous(x, y)

     validate_device(x, y)
     validate_contiguous(x, y)
@@ -110,17 +109,13 @@ def broadcast_add_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)

-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
-
-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Load x block
     blk_x_idx = (spa_bat_o * x_b_s +
blksprs/ops/misc/row_wise.py CHANGED
@@ -8,7 +8,7 @@ from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_config
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, validate_sparsity, \
-    validate_sparsity_block_size
+    validate_sparsity_block_size, ensure_contiguous


 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
@@ -34,7 +34,7 @@ def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
         of the input and the sparsity layout of the output tensor.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
@@ -119,17 +119,17 @@ def row_wise_sum_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_bat = tl.load(s_lut_x + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)

-    spa_row_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_row = tl.load(s_lut_x + spa_row_idx, mask=spa_row_msk)
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Load reverse sparsity index for current block
-    rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
-                       spa_row * s_l_o_r_s)
+    rev_idx_spa_idx = (spa_bat_x * s_l_o_b_s +
+                       spa_row_x * s_l_o_r_s)
     rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)

@@ -176,7 +176,7 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     """
     # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376, should be fixed with the upcoming 3.4.0 release
     x = torch.where(x == -0.0, torch.tensor(0.0), x)
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
@@ -263,17 +263,17 @@ def row_wise_max_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_bat = tl.load(s_lut_x + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)

-    spa_row_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_row = tl.load(s_lut_x + spa_row_idx, mask=spa_row_msk)
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Load reverse sparsity index for current block
-    rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
-                       spa_row * s_l_o_r_s)
+    rev_idx_spa_idx = (spa_bat_x * s_l_o_b_s +
+                       spa_row_x * s_l_o_r_s)
     rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)

@@ -311,6 +311,8 @@ def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
         compressed form.

     """
+    x = ensure_contiguous(x)
+
     validate_dimensions(x)
     validate_contiguous(x)
     validate_device(x)
@@ -361,7 +363,7 @@ def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]

-    (wrap_triton(kernel_blocksparse_row_wise_add)[triton_grid]
+    (wrap_triton(row_wise_add_kernel)[triton_grid]
     (x,
      x_b, x_b_s, x_r_s, x_c_s,
      sparsity_lut_x, s_lut_r, s_lut_r_s, s_lut_c_s,
@@ -383,33 +385,33 @@
              reset_to_zero=["o"]
              )
 @triton.jit
-def kernel_blocksparse_row_wise_add(x,
-                                    x_b, x_b_s, x_r_s, x_c_s,
-                                    s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
-                                    y, y_b, y_b_s, y_r_s, y_c_s,
-                                    s_l_y_b, s_l_y_b_s, s_l_y_r_s,
-                                    r_lut_y,
-                                    o,
-                                    o_b, o_b_s, o_r_s, o_c_s,
-                                    sparsity_block_size,
-                                    TRITON_BLOCK_SIZE: tl.constexpr) -> None:
+def row_wise_add_kernel(x,
+                        x_b, x_b_s, x_r_s, x_c_s,
+                        s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                        y, y_b, y_b_s, y_r_s, y_c_s,
+                        s_l_y_b, s_l_y_b_s, s_l_y_r_s,
+                        r_lut_y,
+                        o,
+                        o_b, o_b_s, o_r_s, o_c_s,
+                        sparsity_block_size,
+                        TRITON_BLOCK_SIZE: tl.constexpr) -> None:
     # Get triton block indices
     pid_blk = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_bat = tl.load(s_lut_x + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)

-    spa_row_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_row = tl.load(s_lut_x + spa_row_idx, mask=spa_row_msk)
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))

     # Get reverse sparsity indices for s
-    rev_idx_spa_s_idx = (spa_bat * s_l_y_b_s +
-                         spa_row * s_l_y_r_s)
+    rev_idx_spa_s_idx = (spa_bat_x * s_l_y_b_s +
+                         spa_row_x * s_l_y_r_s)
     rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0 and rev_idx_spa_s_idx < s_l_y_b * s_l_y_b_s)
     rev_idx_spa_s = tl.load(r_lut_y + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)

blksprs/ops/partitioning.py CHANGED
@@ -5,7 +5,7 @@ from torch._library import triton_op
 from blksprs.ops.flow import flow_pull_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, ensure_contiguous


 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -27,7 +27,7 @@ def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
         Tensor: The sparsity layout of the output tensor.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
@@ -132,7 +132,7 @@ def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
         Tensor: The sparsity layout of the output tensor.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
blksprs/ops/repeat.py CHANGED
@@ -5,7 +5,7 @@ from torch._library import triton_op
 from blksprs.ops.flow import flow_pull_forward, flow_push_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, ensure_contiguous


 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -36,7 +36,7 @@ def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int,
         Tensor: The sparsity layout of the resulting output tensor.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
@@ -77,7 +77,7 @@ def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,
         Tensor: The sparsity layout of the resulting output tensor.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
@@ -142,7 +142,7 @@ def repeat_build_lut(lut: dict, sparsity_layout_x: Tensor, repeats: tuple[int, i
     n_sparse_blocks = torch.sum(lut["sparsity_layout_o"].to(torch.int)).item()
     lut["n_sparse_blocks"] = n_sparse_blocks

-    validate_contiguous(sparsity_layout_o, lut["sparsity_lut"], lut["sparsity_reverse_lut"])
+    validate_contiguous(lut["sparsity_layout_o"], lut["sparsity_lut"], lut["sparsity_reverse_lut"])

     return lut

@@ -178,7 +178,7 @@ def repeat_interleave_build_lut(lut: dict, sparsity_layout_x: Tensor, repeats: i
     n_sparse_blocks = torch.sum(lut["sparsity_layout_o"].to(torch.int)).item()
     lut["n_sparse_blocks"] = n_sparse_blocks

-    validate_contiguous(sparsity_layout_o, lut["sparsity_lut"], lut["sparsity_reverse_lut"])
+    validate_contiguous(lut["sparsity_layout_o"], lut["sparsity_lut"], lut["sparsity_reverse_lut"])

     return lut

blksprs/ops/softmax.py CHANGED
@@ -12,7 +12,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride, ceil_pow2
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32
+    validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32, ensure_contiguous


 def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int, flag_fused: bool = True,
@@ -44,7 +44,7 @@ def softmax_regular(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_si
         BlksprsTensor: The result of the softmax operation as a block-sparse tensor in compressed form.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
@@ -176,13 +176,12 @@ def softmax_kernel(x,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)

-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))

     # Get reverse sparsity indices for s
     rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
@@ -241,13 +240,12 @@ def softmax_kernel_grad(g,
     pid_col = tl.program_id(axis=2)

     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)

-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))

     rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
                          spa_row * s_l_s_r_s)
@@ -337,7 +335,7 @@ def softmax_fused(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size
         BlksprsTensor: The result of the softmax operation as a block-sparse tensor in compressed form.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
blksprs/ops/transpose.py CHANGED
@@ -5,7 +5,7 @@ from torch._library import triton_op
 from blksprs.ops.flow import flow_pull_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, ensure_contiguous


 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -27,7 +27,7 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor,
         Tensor: The sparsity layout of the transposed tensor.

     """
-    x = x.contiguous()
+    x = ensure_contiguous(x)

     validate_dimensions(x)
     validate_contiguous(x)
blksprs/utils/autotuning.py CHANGED
@@ -14,11 +14,11 @@ if blksprs_autotune_mode == "DEFAULT":

        (64, 3, 8),
        (64, 4, 4),
-       (64, 5, 2),
+       (64, 4, 8),

        (128, 3, 8),
        (128, 4, 4),
-       (128, 5, 2),
+       (128, 4, 8),
    ]
elif blksprs_autotune_mode == "TEST":
    autotune_parameters = [
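A hedged sketch of how triples like `(64, 4, 8)` typically become `triton.Config` objects; the `(block_size, num_stages, num_warps)` ordering and the `TRITON_BLOCK_SIZE` key are assumptions for illustration, since the mapping itself is not part of this diff:

    import triton

    # Assumed ordering: (TRITON_BLOCK_SIZE, num_stages, num_warps) -- not confirmed by this diff.
    autotune_parameters = [
        (64, 3, 8), (64, 4, 4), (64, 4, 8),
        (128, 3, 8), (128, 4, 4), (128, 4, 8),
    ]

    configs = [
        triton.Config({"TRITON_BLOCK_SIZE": block_size},
                      num_stages=num_stages, num_warps=num_warps)
        for block_size, num_stages, num_warps in autotune_parameters
    ]

Under that reading, the release trades the (…, 5, 2) variants for (…, 4, 8) ones in both the 64 and 128 block-size groups.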
blksprs/utils/validation.py CHANGED
@@ -1,9 +1,18 @@
 import torch
 from torch import Tensor

+CONTIGUOUS = True
 VALIDATION = True


+def ensure_contiguous(*tensors: Tensor) -> tuple[Tensor, ...]:
+    if _check_skip_contiguous():
+        return tensors
+
+    transformed = tuple(tensor.contiguous() for tensor in tensors)
+    return transformed[0] if len(transformed) == 1 else transformed
+
+
 def validate_dimensions(*tensors: Tensor, dims=3) -> None:
     if _check_skip_validation():
         return
@@ -124,6 +133,19 @@ def validate_sparsity_block_size(sparsity_block_size: int, *tensors):
         raise ValueError("Tensor sizes must be divisible by sparsity block size")


+def _check_skip_contiguous():
+    return not CONTIGUOUS
+
+
+def _set_skip_contiguous(skip_contiguous: bool):
+    global CONTIGUOUS
+    CONTIGUOUS = not skip_contiguous
+
+
+def disable_contiguous():
+    _set_skip_contiguous(True)
+
+
 def _check_skip_validation():
     return not VALIDATION

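A usage sketch of the helpers added above (hypothetical tensors; note from the definition that a single-tensor call unwraps the result, while the disabled path short-circuits and returns the argument tuple unchanged):

    import torch
    from blksprs.utils.validation import ensure_contiguous, disable_contiguous

    x = torch.arange(12.0).reshape(3, 4).t()  # a non-contiguous view
    y = torch.zeros(4, 3)

    x = ensure_contiguous(x)        # single tensor in, single (contiguous) tensor out
    x, y = ensure_contiguous(x, y)  # several tensors come back as a tuple

    # Opt out globally once inputs are known to be contiguous; from here on
    # ensure_contiguous returns its inputs as-is.
    disable_contiguous()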
blksprs-2.1.4.dist-info/METADATA → blksprs-2.1.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.1.4
+Version: 2.1.6
 Summary: A lightweight library for operations on block-sparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
blksprs-2.1.6.dist-info/RECORD ADDED
@@ -0,0 +1,23 @@
+blksprs/__init__.py,sha256=F3JKF_GqJrXYtM31r9eT8kSamz8I8bav-6lbQUiLHfA,1631
+blksprs/layouting/distribution_layout.py,sha256=ur1ty_2U-Hfj78hMWsLZvu7ZuGhzW3qGLKMc72DfTZM,5861
+blksprs/layouting/sparsity_layout.py,sha256=eXHmu2h7K5Q-YUpfOxocJoeP_5ZoQFZf_eHLxRZQbYU,11207
+blksprs/ops/conversion.py,sha256=NcBxWWWzMkjQx_fEfh14RWt688X6J82FzDqByAd3Pj4,21405
+blksprs/ops/distribution.py,sha256=pabgyw0m3A4A0osfnOoKffk-b2BKXCn-lC6BU26ocKY,20180
+blksprs/ops/flow.py,sha256=JEGES5ZbMqxR02rwi2Ym4j3VDxkcRxhFO1f-5nNUlM8,7760
+blksprs/ops/matmul.py,sha256=9XPsKbYBw0cdmZY6i4T3Phbx00LXIuA6KI0EIcyGo9U,11584
+blksprs/ops/partitioning.py,sha256=67_a9a5ZpsRmB4BVTOks0stFWp34cb0nk28zQFkXEZc,9985
+blksprs/ops/repeat.py,sha256=Eo7L-TcrrXb_I6xKXLVklp1EuCuA0sfhPaOzw_8y1eU,9080
+blksprs/ops/softmax.py,sha256=YcoZpdC1BdL4zKRQOSjIRtfGgDoQvUZabgNmjbeY8-4,23470
+blksprs/ops/transpose.py,sha256=AyIPuiMAtUAPJPs9eK-Apz6vjZdmnJO9RF6_yH6u6Fk,4097
+blksprs/ops/misc/broadcast_ops.py,sha256=ro7K2ZMOsscxNEp2HY_6efqJ4Wrf-QCFL4NLeDqvah8,5692
+blksprs/ops/misc/row_wise.py,sha256=dfhuXexyFBaNvfZjOt9w3s29ih19JhWIy04_FhUnHgk,19420
+blksprs/utils/autotuning.py,sha256=xalNP3sWdRn8XiVG4jE1-_iy2QhUmIJvTGM83YwgKA0,2052
+blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
+blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
+blksprs/utils/processing.py,sha256=RNkEDc0g-sNHRuMPkRzNWU13d3_lIkXMJdoqES4yQTM,3738
+blksprs/utils/tools.py,sha256=TKygEKge4wJtJnXXDg8BTL8vzBpqIJsQ_A3_5FmLpcE,859
+blksprs/utils/validation.py,sha256=XmDMAVSg7SHd7KZswFGU_2kshSTWe0dI6yB5iSGj6cQ,4850
+blksprs-2.1.6.dist-info/METADATA,sha256=jALZxAvt1JAvlQc219KI5mRqHsCq624d0P8LzyLxe9Q,9590
+blksprs-2.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+blksprs-2.1.6.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-2.1.6.dist-info/RECORD,,
blksprs-2.1.4.dist-info/RECORD DELETED
@@ -1,23 +0,0 @@
-blksprs/__init__.py,sha256=XERzTtkiElDeBppOO8rNrF6OktUQf_yozDiA4DUXqTY,1615
-blksprs/layouting/distribution_layout.py,sha256=ur1ty_2U-Hfj78hMWsLZvu7ZuGhzW3qGLKMc72DfTZM,5861
-blksprs/layouting/sparsity_layout.py,sha256=eXHmu2h7K5Q-YUpfOxocJoeP_5ZoQFZf_eHLxRZQbYU,11207
-blksprs/ops/conversion.py,sha256=nv5gXiyZkUtk1kCIlPr0Vpaj4G8G6dJdW7StlbV3nDw,21914
-blksprs/ops/distribution.py,sha256=0tPldv0ARzmCV1CU2jvfqpHBgOuHPrDFiCtqsLs7CZc,20789
-blksprs/ops/flow.py,sha256=oUn_xDT74220-EmnBnB8bRNtbS1mjbxWpm76PFsK22o,8246
-blksprs/ops/matmul.py,sha256=ES9bpiCIRBxaynNIL5ftDP0c9LSArbj8YJqkPEzBaIU,11879
-blksprs/ops/partitioning.py,sha256=cfQmY9BZqGTvvJorIhtb-EyuGRJGPraWR-wTKdb47aI,9954
-blksprs/ops/repeat.py,sha256=TLYNxwPuT9y5K9xyM41WK5gnggAJF3lI61Q2K7zWjns,9035
-blksprs/ops/softmax.py,sha256=tfC_jaAKrA956rxGeb57klMuYRKTiyMCd5Zg5DIH3fc,23649
-blksprs/ops/transpose.py,sha256=U-VAyLRT6_NDv9qYSFzBqfVlDeIpTqAMEXkqto0VF6w,4072
-blksprs/ops/misc/broadcast_ops.py,sha256=-PrHiSJikZh8nXUmXxSCtFEP27TTxFr4wcrNxBjnimk,5987
-blksprs/ops/misc/row_wise.py,sha256=n5FJjAuOd8BHBJQx4bsQwr-HmXkR9PYVAqfk77wjOFU,19653
-blksprs/utils/autotuning.py,sha256=a-kmWRjJ3eED2XbjkQeOJSyW8bdIs27HgKMPvAKqWeU,2052
-blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
-blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
-blksprs/utils/processing.py,sha256=RNkEDc0g-sNHRuMPkRzNWU13d3_lIkXMJdoqES4yQTM,3738
-blksprs/utils/tools.py,sha256=TKygEKge4wJtJnXXDg8BTL8vzBpqIJsQ_A3_5FmLpcE,859
-blksprs/utils/validation.py,sha256=G8eQlvJVMKfEX3k2AwBD0A6Ck-gFoRLpLNY6HXsB3fA,4348
-blksprs-2.1.4.dist-info/METADATA,sha256=qGLQunHEIoHlmRvFnM0TVDjOSApwGzBglpZezmfhHLU,9590
-blksprs-2.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-blksprs-2.1.4.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
-blksprs-2.1.4.dist-info/RECORD,,