blksprs 2.1.4-py3-none-any.whl → 2.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blksprs/__init__.py +2 -2
- blksprs/ops/conversion.py +16 -24
- blksprs/ops/distribution.py +15 -25
- blksprs/ops/flow.py +12 -20
- blksprs/ops/matmul.py +8 -13
- blksprs/ops/misc/broadcast_ops.py +8 -13
- blksprs/ops/misc/row_wise.py +40 -38
- blksprs/ops/partitioning.py +3 -3
- blksprs/ops/repeat.py +5 -5
- blksprs/ops/softmax.py +13 -15
- blksprs/ops/transpose.py +2 -2
- blksprs/utils/autotuning.py +2 -2
- blksprs/utils/validation.py +22 -0
- {blksprs-2.1.4.dist-info → blksprs-2.1.6.dist-info}/METADATA +1 -1
- blksprs-2.1.6.dist-info/RECORD +23 -0
- blksprs-2.1.4.dist-info/RECORD +0 -23
- {blksprs-2.1.4.dist-info → blksprs-2.1.6.dist-info}/WHEEL +0 -0
- {blksprs-2.1.4.dist-info → blksprs-2.1.6.dist-info}/top_level.txt +0 -0
blksprs/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 
-__version__ = "2.1.4"
+__version__ = "2.1.6"
 
 
 class ops:
@@ -27,9 +27,9 @@ class utils:
     from blksprs.utils.processing import apply_torch_linear, apply_torch_normalisation, apply_torch_dropout, \
         apply_function_applicable_row_wise
     from blksprs.utils.tools import do_shape_blocksparse, undo_shape_blocksparse
+    from blksprs.utils.validation import disable_contiguous, disable_validation
 
 class validation:
-    from blksprs.utils.validation import disable_validation
     from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_dtype_float, \
         validate_dtype_int, validate_device, validate_sparsity, validate_sparsity_dense, \
         validate_sparsity_block_size
blksprs/ops/conversion.py
CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs, prune_autotune_configs_conversion
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size, validate_sparsity_dense
+    validate_sparsity, validate_sparsity_block_size, validate_sparsity_dense, ensure_contiguous
 
 
 def to_blksprs(x: Tensor, sparsity_layout: Tensor, sparsity_block_size: int) -> BlksprsTensor:
@@ -35,7 +35,7 @@ def to_sparse(x: Tensor, sparsity_layout: Tensor,
         BlksprsTensor: The block-sparse tensor converted to compressed form.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
@@ -106,17 +106,13 @@ def to_sparse_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get sparsity index of current output block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)
 
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
-
-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Load block from dense tensor
     blk_d_idx = (spa_bat * x_b_s +
@@ -191,7 +187,7 @@ def to_dense(x: BlksprsTensor, sparsity_layout: Tensor,
         Tensor: The block-sparse tensor converted to regular form.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x, sparsity_layout)
@@ -339,7 +335,7 @@ def adapt_layout(x: BlksprsTensor, sparsity_layout_from: Tensor, sparsity_block_
         Tensor: The sparsity layout of the resulting output tensor.
 
    """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x, sparsity_layout_from)
@@ -445,17 +441,13 @@ def adapt_layout_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
-
-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)
 
-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Get equivalent sparsity block in from layout
     spa_bat_x = spa_bat_o
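Every kernel hunk in this release follows the same rewrite: the three separate masked scalar loads that fetched the batch, row, and column index of the current sparsity block from the LUT are replaced by one vectorised load over tl.arange(0, 4) (Triton ranges must have power-of-two length, so the fourth lane is masked off with < 3), after which each component is extracted with a one-hot multiply-and-sum. The snippet below is only an illustrative plain-PyTorch sketch of that arithmetic with made-up LUT values, not code from the package:

import torch

# Toy stand-in for the sparsity LUT: one (batch, row, column) triple per sparse block.
# Values and shapes here are invented for illustration.
s_lut = torch.tensor([[0, 2, 5],
                      [1, 0, 3]])
flat = s_lut.flatten()
s_lut_r_s, s_lut_c_s = s_lut.stride()
pid_blk = 0  # program id of the current sparse block

# 2.1.4 pattern: three separate scalar loads.
spa_bat_old = flat[pid_blk * s_lut_r_s + 0 * s_lut_c_s]
spa_row_old = flat[pid_blk * s_lut_r_s + 1 * s_lut_c_s]
spa_col_old = flat[pid_blk * s_lut_r_s + 2 * s_lut_c_s]

# 2.1.6 pattern: one masked vector load over four lanes, then one-hot sums.
lanes = torch.arange(4)                                   # tl.arange(0, 4)
spa_val_msk = lanes < 3                                   # mask off the padding lane
spa_val_idx = pid_blk * s_lut_r_s + lanes * s_lut_c_s
spa_val = torch.where(spa_val_msk, flat[spa_val_idx], 0)  # masked tl.load
spa_bat = (spa_val * (lanes == 0)).sum()                  # tl.sum(spa_val * (tl.arange(0, 4) == 0))
spa_row = (spa_val * (lanes == 1)).sum()
spa_col = (spa_val * (lanes == 2)).sum()

assert torch.equal(torch.stack([spa_bat, spa_row, spa_col]),
                   torch.stack([spa_bat_old, spa_row_old, spa_col_old]))

In the real kernels the masked fourth lane never touches memory, because the mask is passed directly to tl.load.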
blksprs/ops/distribution.py
CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_dtype_int, validate_sparsity_block_size
+    validate_sparsity, validate_dtype_int, validate_sparsity_block_size, ensure_contiguous
 
 
 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -32,8 +32,7 @@ def gather(src: BlksprsTensor, sparsity_layout_src: Tensor,
         BlksprsTensor: The result of the gather operation as a block-sparse tensor in compressed form.
 
     """
-    src = src
-    idx = idx.contiguous()
+    src, idx = ensure_contiguous(src, idx)
 
    validate_dimensions(src, idx)
    validate_contiguous(src, idx)
@@ -125,17 +124,13 @@ def gather_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)
 
-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
-
-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Load index values
     blk_i_idx = ((pid_blk * i_b_s) +
@@ -265,8 +260,7 @@ def scatter_reduce(src: BlksprsTensor, sparsity_layout_src: Tensor,
         BlksprsTensor: The result of the scatter operation as a block-sparse tensor in compressed form.
 
     """
-    src = src
-    idx = idx.contiguous()
+    src, idx = ensure_contiguous(src, idx)
 
    validate_dimensions(src, idx)
    validate_contiguous(src, idx)
@@ -374,17 +368,13 @@ def scatter_reduce_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_x_idx = (pid_blk * s_lut_x_r_s + 0 * s_lut_x_c_s)
-    spa_bat_x_msk = (spa_bat_x_idx >= 0 and spa_bat_x_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_bat_x = tl.load(s_lut_x + spa_bat_x_idx, mask=spa_bat_x_msk)
-
-    spa_row_x_idx = (pid_blk * s_lut_x_r_s + 1 * s_lut_x_c_s)
-    spa_row_x_msk = (spa_row_x_idx >= 0 and spa_row_x_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_row_x = tl.load(s_lut_x + spa_row_x_idx, mask=spa_row_x_msk)
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)
 
-    spa_col_x_idx = (pid_blk * s_lut_x_r_s + 2 * s_lut_x_c_s)
-    spa_col_x_msk = (spa_col_x_idx >= 0 and spa_col_x_idx < s_lut_x_r * s_lut_x_r_s)
-    spa_col_x = tl.load(s_lut_x + spa_col_x_idx, mask=spa_col_x_msk)
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Load x values
     blk_x_idx = ((pid_blk * x_b_s) +
blksprs/ops/flow.py
CHANGED
@@ -66,17 +66,13 @@ def flow_pull_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get sparsity index of current output block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)
 
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
-
-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Load reverse sparsity index
     rev_idx_spa_idx = (spa_bat * s_l_o_b_s +
@@ -157,17 +153,13 @@ def flow_push_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get sparsity index of current input block consisting of its batch, row, and column index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
-
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)
 
-    spa_col_idx = (pid_blk * s_lut_r_s + 2 * s_lut_c_s)
-    spa_col_msk = (spa_col_idx >= 0 and spa_col_idx < s_lut_r * s_lut_r_s)
-    spa_col = tl.load(s_lut + spa_col_idx, mask=spa_col_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Get reverse sparsity index
     rev_idx_spa_idx = (spa_bat * s_l_x_b_s +
blksprs/ops/matmul.py
CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size, validate_dtype_float
+    validate_sparsity, validate_sparsity_block_size, validate_dtype_float, ensure_contiguous
 
 
 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -34,8 +34,7 @@ def matmul(x: BlksprsTensor, sparsity_layout_x: Tensor,
         BlksprsTensor: The result of the matrix multiplication as a block-sparse tensor in compressed form.
 
     """
-    x = x
-    y = y.contiguous()
+    x, y = ensure_contiguous(x, y)
 
    validate_dimensions(x, y)
    validate_contiguous(x, y)
@@ -145,17 +144,13 @@ def matmul_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)
 
-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
-
-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Setup buffer
     buf = tl.zeros(shape=(TRITON_BLOCK_SIZE, TRITON_BLOCK_SIZE), dtype=tl.float32)
blksprs/ops/misc/broadcast_ops.py
CHANGED
@@ -9,7 +9,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_device, \
-    validate_sparsity_block_size
+    validate_sparsity_block_size, ensure_contiguous
 
 
 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -29,8 +29,7 @@ def broadcast_add(x: Tensor, y: Tensor, sparsity_layout_output: Tensor,
         output tensor corresponds to x(i) + y(j).
 
     """
-    x = x
-    y = y.contiguous()
+    x, y = ensure_contiguous(x, y)
 
    validate_device(x, y)
    validate_contiguous(x, y)
@@ -110,17 +109,13 @@ def broadcast_add_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch, row, and column index
-    spa_bat_o_idx = (pid_blk * s_lut_o_r_s + 0 * s_lut_o_c_s)
-    spa_bat_o_msk = (spa_bat_o_idx >= 0 and spa_bat_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_bat_o = tl.load(s_lut_o + spa_bat_o_idx, mask=spa_bat_o_msk)
+    spa_val_idx = pid_blk * s_lut_o_r_s + tl.arange(0, 4) * s_lut_o_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_o + spa_val_idx, mask=spa_val_msk)
 
-    spa_row_o_idx = (pid_blk * s_lut_o_r_s + 1 * s_lut_o_c_s)
-    spa_row_o_msk = (spa_row_o_idx >= 0 and spa_row_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_row_o = tl.load(s_lut_o + spa_row_o_idx, mask=spa_row_o_msk)
-
-    spa_col_o_idx = (pid_blk * s_lut_o_r_s + 2 * s_lut_o_c_s)
-    spa_col_o_msk = (spa_col_o_idx >= 0 and spa_col_o_idx < s_lut_o_r * s_lut_o_r_s)
-    spa_col_o = tl.load(s_lut_o + spa_col_o_idx, mask=spa_col_o_msk)
+    spa_bat_o = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_o = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_o = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Load x block
     blk_x_idx = (spa_bat_o * x_b_s +
blksprs/ops/misc/row_wise.py
CHANGED
@@ -8,7 +8,7 @@ from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, validate_sparsity, \
-    validate_sparsity_block_size
+    validate_sparsity_block_size, ensure_contiguous
 
 
 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float32)
@@ -34,7 +34,7 @@ def row_wise_sum(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
         of the input and the sparsity layout of the output tensor.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
@@ -119,17 +119,17 @@ def row_wise_sum_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch and row index
-
-
-
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)
 
-
-
-
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Load reverse sparsity index for current block
-    rev_idx_spa_idx = (
-
+    rev_idx_spa_idx = (spa_bat_x * s_l_o_b_s +
+                       spa_row_x * s_l_o_r_s)
     rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
@@ -176,7 +176,7 @@ def row_wise_max(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size:
     """
     # TODO Fix for triton bug, see https://github.com/triton-lang/triton/issues/6376, should be fixed with the upcoming 3.4.0 release
     x = torch.where(x == -0.0, torch.tensor(0.0), x)
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
@@ -263,17 +263,17 @@ def row_wise_max_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch and row index
-
-
-
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)
 
-
-
-
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Load reverse sparsity index for current block
-    rev_idx_spa_idx = (
-
+    rev_idx_spa_idx = (spa_bat_x * s_l_o_b_s +
+                       spa_row_x * s_l_o_r_s)
     rev_idx_spa_msk = (rev_idx_spa_idx >= 0 and rev_idx_spa_idx < s_l_o_b * s_l_o_b_s)
     rev_idx_spa = tl.load(r_lut_o + rev_idx_spa_idx, mask=rev_idx_spa_msk).to(tl.int32)
 
@@ -311,6 +311,8 @@ def row_wise_add(x: BlksprsTensor, sparsity_layout_x: Tensor, y: Tensor,
         compressed form.
 
     """
+    x = ensure_contiguous(x)
+
    validate_dimensions(x)
    validate_contiguous(x)
    validate_device(x)
@@ -361,7 +363,7 @@ def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
                    triton.cdiv(o_r, meta["TRITON_BLOCK_SIZE"]),
                    triton.cdiv(o_c, meta["TRITON_BLOCK_SIZE"])]
 
-    (wrap_triton(
+    (wrap_triton(row_wise_add_kernel)[triton_grid]
     (x,
      x_b, x_b_s, x_r_s, x_c_s,
      sparsity_lut_x, s_lut_r, s_lut_r_s, s_lut_c_s,
@@ -383,33 +385,33 @@ def row_wise_add_forward(x: Tensor, sparsity_lut_x: Tensor,
     reset_to_zero=["o"]
 )
 @triton.jit
-def
-
-
-
-
-
-
-
-
-
+def row_wise_add_kernel(x,
+                        x_b, x_b_s, x_r_s, x_c_s,
+                        s_lut_x, s_lut_x_r, s_lut_x_r_s, s_lut_x_c_s,
+                        y, y_b, y_b_s, y_r_s, y_c_s,
+                        s_l_y_b, s_l_y_b_s, s_l_y_r_s,
+                        r_lut_y,
+                        o,
+                        o_b, o_b_s, o_r_s, o_c_s,
+                        sparsity_block_size,
+                        TRITON_BLOCK_SIZE: tl.constexpr) -> None:
     # Get triton block indices
     pid_blk = tl.program_id(axis=0)
     pid_row = tl.program_id(axis=1)
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch and row index
-
-
-
+    spa_val_idx = pid_blk * s_lut_x_r_s + tl.arange(0, 4) * s_lut_x_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut_x + spa_val_idx, mask=spa_val_msk)
 
-
-
-
+    spa_bat_x = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row_x = tl.sum(spa_val * (tl.arange(0, 4) == 1))
+    spa_col_x = tl.sum(spa_val * (tl.arange(0, 4) == 2))
 
     # Get reverse sparsity indices for s
-    rev_idx_spa_s_idx = (
-
+    rev_idx_spa_s_idx = (spa_bat_x * s_l_y_b_s +
+                         spa_row_x * s_l_y_r_s)
     rev_idx_spa_s_msk = (rev_idx_spa_s_idx >= 0 and rev_idx_spa_s_idx < s_l_y_b * s_l_y_b_s)
     rev_idx_spa_s = tl.load(r_lut_y + rev_idx_spa_s_idx, mask=rev_idx_spa_s_msk).to(tl.int32)
 
blksprs/ops/partitioning.py
CHANGED
@@ -5,7 +5,7 @@ from torch._library import triton_op
 from blksprs.ops.flow import flow_pull_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, ensure_contiguous
 
 
 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -27,7 +27,7 @@ def split(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
         Tensor: The sparsity layout of the output tensor.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
@@ -132,7 +132,7 @@ def merge(x: BlksprsTensor, sparsity_layout: Tensor, partitions: int,
         Tensor: The sparsity layout of the output tensor.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
blksprs/ops/repeat.py
CHANGED
@@ -5,7 +5,7 @@ from torch._library import triton_op
 from blksprs.ops.flow import flow_pull_forward, flow_push_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, ensure_contiguous
 
 
 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -36,7 +36,7 @@ def repeat(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: tuple[int, int,
         Tensor: The sparsity layout of the resulting output tensor.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
@@ -77,7 +77,7 @@ def repeat_interleave(x: BlksprsTensor, sparsity_layout_x: Tensor, repeats: int,
         Tensor: The sparsity layout of the resulting output tensor.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
@@ -142,7 +142,7 @@ def repeat_build_lut(lut: dict, sparsity_layout_x: Tensor, repeats: tuple[int, i
     n_sparse_blocks = torch.sum(lut["sparsity_layout_o"].to(torch.int)).item()
     lut["n_sparse_blocks"] = n_sparse_blocks
 
-    validate_contiguous(sparsity_layout_o, lut["sparsity_lut"], lut["sparsity_reverse_lut"])
+    validate_contiguous(lut["sparsity_layout_o"], lut["sparsity_lut"], lut["sparsity_reverse_lut"])
 
     return lut
 
@@ -178,7 +178,7 @@ def repeat_interleave_build_lut(lut: dict, sparsity_layout_x: Tensor, repeats: i
     n_sparse_blocks = torch.sum(lut["sparsity_layout_o"].to(torch.int)).item()
     lut["n_sparse_blocks"] = n_sparse_blocks
 
-    validate_contiguous(sparsity_layout_o, lut["sparsity_lut"], lut["sparsity_reverse_lut"])
+    validate_contiguous(lut["sparsity_layout_o"], lut["sparsity_lut"], lut["sparsity_reverse_lut"])
 
     return lut
 
blksprs/ops/softmax.py
CHANGED
@@ -12,7 +12,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.tools import stride, ceil_pow2
 from blksprs.utils.autotuning import get_autotune_configs, prune_autotune_configs
 from blksprs.utils.validation import validate_contiguous, validate_dimensions, validate_device, \
-    validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32
+    validate_sparsity, validate_sparsity_block_size, validate_dtype_float_32, ensure_contiguous
 
 
 def softmax(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size: int, flag_fused: bool = True,
@@ -44,7 +44,7 @@ def softmax_regular(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_si
         BlksprsTensor: The result of the softmax operation as a block-sparse tensor in compressed form.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
@@ -176,13 +176,12 @@ def softmax_kernel(x,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)
 
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
 
     # Get reverse sparsity indices for s
     rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
@@ -241,13 +240,12 @@ def softmax_kernel_grad(g,
     pid_col = tl.program_id(axis=2)
 
     # Get position of current sparsity block consisting of its batch and row index
-    spa_bat_idx = (pid_blk * s_lut_r_s + 0 * s_lut_c_s)
-    spa_bat_msk = (spa_bat_idx >= 0 and spa_bat_idx < s_lut_r * s_lut_r_s)
-    spa_bat = tl.load(s_lut + spa_bat_idx, mask=spa_bat_msk)
+    spa_val_idx = pid_blk * s_lut_r_s + tl.arange(0, 4) * s_lut_c_s
+    spa_val_msk = (tl.arange(0, 4) < 3)
+    spa_val = tl.load(s_lut + spa_val_idx, mask=spa_val_msk)
 
-    spa_row_idx = (pid_blk * s_lut_r_s + 1 * s_lut_c_s)
-    spa_row_msk = (spa_row_idx >= 0 and spa_row_idx < s_lut_r * s_lut_r_s)
-    spa_row = tl.load(s_lut + spa_row_idx, mask=spa_row_msk)
+    spa_bat = tl.sum(spa_val * (tl.arange(0, 4) == 0))
+    spa_row = tl.sum(spa_val * (tl.arange(0, 4) == 1))
 
     rev_idx_spa_s_idx = (spa_bat * s_l_s_b_s +
                          spa_row * s_l_s_r_s)
@@ -337,7 +335,7 @@ def softmax_fused(x: BlksprsTensor, sparsity_layout: Tensor, sparsity_block_size
         BlksprsTensor: The result of the softmax operation as a block-sparse tensor in compressed form.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
blksprs/ops/transpose.py
CHANGED
@@ -5,7 +5,7 @@ from torch._library import triton_op
 from blksprs.ops.flow import flow_pull_forward
 from blksprs.utils.blksprs_tensor import BlksprsTensor
 from blksprs.utils.validation import validate_dimensions, validate_contiguous, validate_device, \
-    validate_sparsity, validate_sparsity_block_size
+    validate_sparsity, validate_sparsity_block_size, ensure_contiguous
 
 
 @torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
@@ -27,7 +27,7 @@ def transpose(x: BlksprsTensor, sparsity_layout: Tensor,
         Tensor: The sparsity layout of the transposed tensor.
 
     """
-    x = x
+    x = ensure_contiguous(x)
 
    validate_dimensions(x)
    validate_contiguous(x)
blksprs/utils/autotuning.py
CHANGED
blksprs/utils/validation.py
CHANGED
@@ -1,9 +1,18 @@
 import torch
 from torch import Tensor
 
+CONTIGUOUS = True
 VALIDATION = True
 
 
+def ensure_contiguous(*tensors: Tensor) -> tuple[Tensor, ...]:
+    if _check_skip_contiguous():
+        return tensors
+
+    transformed = tuple(tensor.contiguous() for tensor in tensors)
+    return transformed[0] if len(transformed) == 1 else transformed
+
+
 def validate_dimensions(*tensors: Tensor, dims=3) -> None:
     if _check_skip_validation():
         return
@@ -124,6 +133,19 @@ def validate_sparsity_block_size(sparsity_block_size: int, *tensors):
             raise ValueError("Tensor sizes must be divisible by sparsity block size")
 
 
+def _check_skip_contiguous():
+    return not CONTIGUOUS
+
+
+def _set_skip_contiguous(skip_contiguous: bool):
+    global CONTIGUOUS
+    CONTIGUOUS = not skip_contiguous
+
+
+def disable_contiguous():
+    _set_skip_contiguous(True)
+
+
 def _check_skip_validation():
     return not VALIDATION
 
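For reference, a short usage sketch of the helper and the module-level switches added in this version; the tensor shapes are invented, and the import path is taken from the hunks above (blksprs and its dependencies must be installed for the import to work):

import torch
from blksprs.utils.validation import ensure_contiguous, disable_contiguous, disable_validation

# Default behaviour: inputs are made contiguous on entry to the ops.
x = torch.randn(2, 64, 64).transpose(1, 2)    # non-contiguous view (shapes made up)
y = torch.randn(2, 64, 64)
print(x.is_contiguous())                      # False
x = ensure_contiguous(x)                      # a single argument comes back as a single tensor
print(x.is_contiguous())                      # True
x, y = ensure_contiguous(x, y)                # several arguments come back as a tuple

# Opt-out switches: after these calls ensure_contiguous() passes tensors through
# without calling .contiguous(), and the validate_* checks are skipped.
disable_contiguous()
disable_validation()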
{blksprs-2.1.4.dist-info → blksprs-2.1.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.1.4
+Version: 2.1.6
 Summary: A lightweight library for operations on block-sparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
blksprs-2.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,23 @@
+blksprs/__init__.py,sha256=F3JKF_GqJrXYtM31r9eT8kSamz8I8bav-6lbQUiLHfA,1631
+blksprs/layouting/distribution_layout.py,sha256=ur1ty_2U-Hfj78hMWsLZvu7ZuGhzW3qGLKMc72DfTZM,5861
+blksprs/layouting/sparsity_layout.py,sha256=eXHmu2h7K5Q-YUpfOxocJoeP_5ZoQFZf_eHLxRZQbYU,11207
+blksprs/ops/conversion.py,sha256=NcBxWWWzMkjQx_fEfh14RWt688X6J82FzDqByAd3Pj4,21405
+blksprs/ops/distribution.py,sha256=pabgyw0m3A4A0osfnOoKffk-b2BKXCn-lC6BU26ocKY,20180
+blksprs/ops/flow.py,sha256=JEGES5ZbMqxR02rwi2Ym4j3VDxkcRxhFO1f-5nNUlM8,7760
+blksprs/ops/matmul.py,sha256=9XPsKbYBw0cdmZY6i4T3Phbx00LXIuA6KI0EIcyGo9U,11584
+blksprs/ops/partitioning.py,sha256=67_a9a5ZpsRmB4BVTOks0stFWp34cb0nk28zQFkXEZc,9985
+blksprs/ops/repeat.py,sha256=Eo7L-TcrrXb_I6xKXLVklp1EuCuA0sfhPaOzw_8y1eU,9080
+blksprs/ops/softmax.py,sha256=YcoZpdC1BdL4zKRQOSjIRtfGgDoQvUZabgNmjbeY8-4,23470
+blksprs/ops/transpose.py,sha256=AyIPuiMAtUAPJPs9eK-Apz6vjZdmnJO9RF6_yH6u6Fk,4097
+blksprs/ops/misc/broadcast_ops.py,sha256=ro7K2ZMOsscxNEp2HY_6efqJ4Wrf-QCFL4NLeDqvah8,5692
+blksprs/ops/misc/row_wise.py,sha256=dfhuXexyFBaNvfZjOt9w3s29ih19JhWIy04_FhUnHgk,19420
+blksprs/utils/autotuning.py,sha256=xalNP3sWdRn8XiVG4jE1-_iy2QhUmIJvTGM83YwgKA0,2052
+blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
+blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
+blksprs/utils/processing.py,sha256=RNkEDc0g-sNHRuMPkRzNWU13d3_lIkXMJdoqES4yQTM,3738
+blksprs/utils/tools.py,sha256=TKygEKge4wJtJnXXDg8BTL8vzBpqIJsQ_A3_5FmLpcE,859
+blksprs/utils/validation.py,sha256=XmDMAVSg7SHd7KZswFGU_2kshSTWe0dI6yB5iSGj6cQ,4850
+blksprs-2.1.6.dist-info/METADATA,sha256=jALZxAvt1JAvlQc219KI5mRqHsCq624d0P8LzyLxe9Q,9590
+blksprs-2.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+blksprs-2.1.6.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-2.1.6.dist-info/RECORD,,
blksprs-2.1.4.dist-info/RECORD
DELETED
@@ -1,23 +0,0 @@
-blksprs/__init__.py,sha256=XERzTtkiElDeBppOO8rNrF6OktUQf_yozDiA4DUXqTY,1615
-blksprs/layouting/distribution_layout.py,sha256=ur1ty_2U-Hfj78hMWsLZvu7ZuGhzW3qGLKMc72DfTZM,5861
-blksprs/layouting/sparsity_layout.py,sha256=eXHmu2h7K5Q-YUpfOxocJoeP_5ZoQFZf_eHLxRZQbYU,11207
-blksprs/ops/conversion.py,sha256=nv5gXiyZkUtk1kCIlPr0Vpaj4G8G6dJdW7StlbV3nDw,21914
-blksprs/ops/distribution.py,sha256=0tPldv0ARzmCV1CU2jvfqpHBgOuHPrDFiCtqsLs7CZc,20789
-blksprs/ops/flow.py,sha256=oUn_xDT74220-EmnBnB8bRNtbS1mjbxWpm76PFsK22o,8246
-blksprs/ops/matmul.py,sha256=ES9bpiCIRBxaynNIL5ftDP0c9LSArbj8YJqkPEzBaIU,11879
-blksprs/ops/partitioning.py,sha256=cfQmY9BZqGTvvJorIhtb-EyuGRJGPraWR-wTKdb47aI,9954
-blksprs/ops/repeat.py,sha256=TLYNxwPuT9y5K9xyM41WK5gnggAJF3lI61Q2K7zWjns,9035
-blksprs/ops/softmax.py,sha256=tfC_jaAKrA956rxGeb57klMuYRKTiyMCd5Zg5DIH3fc,23649
-blksprs/ops/transpose.py,sha256=U-VAyLRT6_NDv9qYSFzBqfVlDeIpTqAMEXkqto0VF6w,4072
-blksprs/ops/misc/broadcast_ops.py,sha256=-PrHiSJikZh8nXUmXxSCtFEP27TTxFr4wcrNxBjnimk,5987
-blksprs/ops/misc/row_wise.py,sha256=n5FJjAuOd8BHBJQx4bsQwr-HmXkR9PYVAqfk77wjOFU,19653
-blksprs/utils/autotuning.py,sha256=a-kmWRjJ3eED2XbjkQeOJSyW8bdIs27HgKMPvAKqWeU,2052
-blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
-blksprs/utils/blksprs_tensor.py,sha256=pfoz59aJixj_fIoFx76ySiygwRQUemmgjMKepZ2c4j0,244
-blksprs/utils/processing.py,sha256=RNkEDc0g-sNHRuMPkRzNWU13d3_lIkXMJdoqES4yQTM,3738
-blksprs/utils/tools.py,sha256=TKygEKge4wJtJnXXDg8BTL8vzBpqIJsQ_A3_5FmLpcE,859
-blksprs/utils/validation.py,sha256=G8eQlvJVMKfEX3k2AwBD0A6Ck-gFoRLpLNY6HXsB3fA,4348
-blksprs-2.1.4.dist-info/METADATA,sha256=qGLQunHEIoHlmRvFnM0TVDjOSApwGzBglpZezmfhHLU,9590
-blksprs-2.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-blksprs-2.1.4.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
-blksprs-2.1.4.dist-info/RECORD,,
{blksprs-2.1.4.dist-info → blksprs-2.1.6.dist-info}/WHEEL
File without changes
{blksprs-2.1.4.dist-info → blksprs-2.1.6.dist-info}/top_level.txt
File without changes