quack-kernels 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quack/__init__.py +1 -1
- quack/activation.py +72 -64
- quack/broadcast_utils.py +1 -1
- quack/copy_utils.py +14 -18
- quack/fast_math.py +29 -76
- quack/gemm_act.py +296 -8
- quack/gemm_dact.py +520 -4
- quack/gemm_default_epi.py +4 -4
- quack/gemm_interface.py +363 -0
- quack/gemm_sm100.py +62 -88
- quack/gemm_sm90.py +68 -114
- quack/gemm_symmetric.py +2 -6
- quack/layout_utils.py +2 -4
- quack/linear.py +37 -0
- quack/pipeline.py +59 -89
- quack/reduce.py +2 -2
- quack/rmsnorm.py +1 -3
- quack/sm90_utils.py +5 -3
- quack/sort/bitonic_sort.py +3 -3
- quack/tile_scheduler.py +310 -256
- quack/topk.py +4 -4
- quack/utils.py +76 -40
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/METADATA +2 -2
- quack_kernels-0.2.6.dist-info/RECORD +45 -0
- quack_kernels-0.2.5.dist-info/RECORD +0 -45
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/WHEEL +0 -0
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {quack_kernels-0.2.5.dist-info → quack_kernels-0.2.6.dist-info}/top_level.txt +0 -0
quack/topk.py
CHANGED
|
@@ -105,7 +105,7 @@ class TopK:
|
|
|
105
105
|
|
|
106
106
|
if tXcX[0][0] < shape[0]:
|
|
107
107
|
copy(tXgX, tXrX)
|
|
108
|
-
tXrX_f32 = cute.
|
|
108
|
+
tXrX_f32 = cute.make_rmem_tensor(tXrX.shape, Float32)
|
|
109
109
|
tXrX_f32.store(tXrX.load().to(Float32))
|
|
110
110
|
|
|
111
111
|
# Encode the indices into the bottom bits of values.
|
|
@@ -138,7 +138,7 @@ class TopK:
|
|
|
138
138
|
# 1 -> 0b11111, 2 -> 0b11110, 4 -> 0b11100, 8 -> 0b11000, 16 -> 0b10000, 32 -> 0b00000
|
|
139
139
|
mask = cute.arch.WARP_SIZE - threads_per_row
|
|
140
140
|
mask_and_clamp = mask << 8 | (cute.arch.WARP_SIZE - 1)
|
|
141
|
-
topk_vals_split = cute.
|
|
141
|
+
topk_vals_split = cute.make_rmem_tensor((vecsize_out, nvec_per_thread), Float32)
|
|
142
142
|
for i in cutlass.range(cute.ceil_div(self.k, vecsize_out), unroll_full=True):
|
|
143
143
|
should_receive = tidx % threads_per_row == i % threads_per_row
|
|
144
144
|
for v in cutlass.range(vecsize_out, unroll_full=True):
|
|
@@ -154,7 +154,7 @@ class TopK:
|
|
|
154
154
|
|
|
155
155
|
# Extract indices and clean values
|
|
156
156
|
topk_vals_i32 = cute.recast_tensor(topk_vals_split, Int32)
|
|
157
|
-
topk_indices = cute.
|
|
157
|
+
topk_indices = cute.make_rmem_tensor(topk_vals_i32.shape, Int32)
|
|
158
158
|
for i in cutlass.range(cute.size(topk_vals_i32), unroll_full=True):
|
|
159
159
|
# Extract the encoded index from the last log_N bits
|
|
160
160
|
encoded_idx = topk_vals_i32[i] & idx_mask
|
|
@@ -420,7 +420,7 @@ class TopKBackward(ReductionBase):
|
|
|
420
420
|
grads = vals_f32 * (dvals_f32 - dot)
|
|
421
421
|
else:
|
|
422
422
|
grads = dvals_f32
|
|
423
|
-
grad_cvt = cute.
|
|
423
|
+
grad_cvt = cute.make_rmem_tensor(tXrdV.shape, mdX.element_type)
|
|
424
424
|
grad_cvt.store(grads.to(mdX.element_type))
|
|
425
425
|
|
|
426
426
|
# Scatter values to smem
|
quack/utils.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
|
|
2
2
|
|
|
3
3
|
import math
|
|
4
|
-
from functools import partial
|
|
5
4
|
from typing import Optional, Tuple, Union
|
|
6
5
|
|
|
7
6
|
import cutlass
|
|
@@ -12,18 +11,6 @@ from cutlass.cutlass_dsl import T, dsl_user_op
|
|
|
12
11
|
from cutlass._mlir.dialects import llvm, nvvm, vector
|
|
13
12
|
|
|
14
13
|
|
|
15
|
-
# cute.arch.{fma,mul,add}_packed_f32x2 uses RZ rounding mode by default
|
|
16
|
-
fma_packed_f32x2 = partial(cute.arch.fma_packed_f32x2, rnd=nvvm.RoundingModeKind.RN)
|
|
17
|
-
mul_packed_f32x2 = partial(cute.arch.mul_packed_f32x2, rnd=nvvm.RoundingModeKind.RN)
|
|
18
|
-
add_packed_f32x2 = partial(cute.arch.add_packed_f32x2, rnd=nvvm.RoundingModeKind.RN)
|
|
19
|
-
sub_packed_f32x2 = partial(
|
|
20
|
-
cute.arch.calc_packed_f32x2_op,
|
|
21
|
-
src_c=None,
|
|
22
|
-
calc_func=nvvm.sub_packed_f32x2,
|
|
23
|
-
rnd=nvvm.RoundingModeKind.RN,
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
|
|
27
14
|
@dsl_user_op
|
|
28
15
|
def elem_pointer(x: cute.Tensor, coord: cute.Coord, *, loc=None, ip=None) -> cute.Pointer:
|
|
29
16
|
return x.iterator + cute.crd2idx(coord, x.layout, loc=loc, ip=ip)
|
|
@@ -89,6 +76,54 @@ def store_shared_remote(
|
|
|
89
76
|
)
|
|
90
77
|
|
|
91
78
|
|
|
79
|
+
@dsl_user_op
|
|
80
|
+
def store_shared_remote_x4(
|
|
81
|
+
val0: Float32 | Int32,
|
|
82
|
+
val1: Float32 | Int32,
|
|
83
|
+
val2: Float32 | Int32,
|
|
84
|
+
val3: Float32 | Int32,
|
|
85
|
+
smem_ptr: cute.Pointer,
|
|
86
|
+
mbar_ptr: cute.Pointer,
|
|
87
|
+
peer_cta_rank_in_cluster: cute.typing.Int,
|
|
88
|
+
*,
|
|
89
|
+
loc=None,
|
|
90
|
+
ip=None,
|
|
91
|
+
) -> None:
|
|
92
|
+
remote_smem_ptr_i32 = set_block_rank(
|
|
93
|
+
smem_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
|
|
94
|
+
).ir_value()
|
|
95
|
+
remote_mbar_ptr_i32 = set_block_rank(
|
|
96
|
+
mbar_ptr, peer_cta_rank_in_cluster, loc=loc, ip=ip
|
|
97
|
+
).ir_value()
|
|
98
|
+
assert isinstance(val0, (Float32, Int32)), "val must be Float32, or Int32"
|
|
99
|
+
dtype = Float32 if isinstance(val0, Float32) else Int32
|
|
100
|
+
suffix = {Float32: "f32", Int32: "s32"}[dtype]
|
|
101
|
+
constraint = {Float32: "f", Int32: "r"}[dtype]
|
|
102
|
+
llvm.inline_asm(
|
|
103
|
+
None,
|
|
104
|
+
[
|
|
105
|
+
remote_smem_ptr_i32,
|
|
106
|
+
remote_mbar_ptr_i32,
|
|
107
|
+
dtype(val0).ir_value(loc=loc, ip=ip),
|
|
108
|
+
dtype(val1).ir_value(loc=loc, ip=ip),
|
|
109
|
+
dtype(val2).ir_value(loc=loc, ip=ip),
|
|
110
|
+
dtype(val3).ir_value(loc=loc, ip=ip),
|
|
111
|
+
],
|
|
112
|
+
"{\n\t"
|
|
113
|
+
f".reg .v4 .{suffix} abcd;\n\t"
|
|
114
|
+
f"mov.{suffix} abcd.x, $2;\n\t"
|
|
115
|
+
f"mov.{suffix} abcd.y, $3;\n\t"
|
|
116
|
+
f"mov.{suffix} abcd.z, $4;\n\t"
|
|
117
|
+
f"mov.{suffix} abcd.w, $5;\n\t"
|
|
118
|
+
f"st.async.shared::cluster.mbarrier::complete_tx::bytes.v4.{suffix} [$0], abcd, [$1];\n\t"
|
|
119
|
+
"}\n",
|
|
120
|
+
f"r,r,{constraint},{constraint},{constraint},{constraint}",
|
|
121
|
+
has_side_effects=True,
|
|
122
|
+
is_align_stack=False,
|
|
123
|
+
asm_dialect=llvm.AsmDialect.AD_ATT,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
|
|
92
127
|
@dsl_user_op
|
|
93
128
|
def fmin(a: Union[float, Float32], b: Union[float, Float32], *, loc=None, ip=None) -> Float32:
|
|
94
129
|
return Float32(
|
|
@@ -132,25 +167,6 @@ def ceil(a: float | Float32, *, loc=None, ip=None) -> Int32:
|
|
|
132
167
|
)
|
|
133
168
|
|
|
134
169
|
|
|
135
|
-
@dsl_user_op
|
|
136
|
-
def prmt(a: int | Int32, b: int | Int32, c: int | Int32, *, loc=None, ip=None) -> Int32:
|
|
137
|
-
return Int32(
|
|
138
|
-
llvm.inline_asm(
|
|
139
|
-
T.i32(),
|
|
140
|
-
[
|
|
141
|
-
Int32(a).ir_value(loc=loc, ip=ip),
|
|
142
|
-
Int32(b).ir_value(loc=loc, ip=ip),
|
|
143
|
-
Int32(c).ir_value(loc=loc, ip=ip),
|
|
144
|
-
],
|
|
145
|
-
"prmt.b32 $0, $1, $2, $3;",
|
|
146
|
-
"=r,r,r,r",
|
|
147
|
-
has_side_effects=False,
|
|
148
|
-
is_align_stack=False,
|
|
149
|
-
asm_dialect=llvm.AsmDialect.AD_ATT,
|
|
150
|
-
)
|
|
151
|
-
)
|
|
152
|
-
|
|
153
|
-
|
|
154
170
|
@cute.jit
|
|
155
171
|
def fill_oob(tXsX: cute.Tensor, tXpX: Optional[cute.Tensor], fill_value: cute.Numeric) -> None:
|
|
156
172
|
"""Fill out-of-bounds values in shared memory tensor.
|
|
@@ -210,14 +226,34 @@ def warp_prefix_sum(val: Int32, lane: Optional[Int32] = None) -> Int32:
|
|
|
210
226
|
|
|
211
227
|
|
|
212
228
|
@dsl_user_op
|
|
213
|
-
def
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
229
|
+
def atomic_inc_i32(a: int | Int32, gmem_ptr: cute.Pointer, *, loc=None, ip=None) -> Int32:
|
|
230
|
+
from cutlass import CUDA_VERSION
|
|
231
|
+
|
|
232
|
+
# * NVVM call based on nvvm version
|
|
233
|
+
if CUDA_VERSION.major == 12 and CUDA_VERSION.minor == 9:
|
|
234
|
+
# Old API: requires explicit result type as first positional argument
|
|
235
|
+
return nvvm.atomicrmw(
|
|
236
|
+
res=T.i32(), op=nvvm.AtomicOpKind.INC, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
|
|
237
|
+
)
|
|
238
|
+
else:
|
|
239
|
+
# New API: infers result type automatically
|
|
240
|
+
return nvvm.atomicrmw(
|
|
241
|
+
op=nvvm.AtomicOpKind.INC, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
|
|
242
|
+
)
|
|
217
243
|
|
|
218
244
|
|
|
219
245
|
@dsl_user_op
|
|
220
|
-
def
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
246
|
+
def atomic_add_i32(a: int | Int32, gmem_ptr: cute.Pointer, *, loc=None, ip=None) -> Int32:
|
|
247
|
+
from cutlass import CUDA_VERSION
|
|
248
|
+
|
|
249
|
+
# * NVVM call based on nvvm version
|
|
250
|
+
if CUDA_VERSION.major == 12 and CUDA_VERSION.minor == 9:
|
|
251
|
+
# Old API: requires explicit result type as first positional argument
|
|
252
|
+
return nvvm.atomicrmw(
|
|
253
|
+
res=T.i32(), op=nvvm.AtomicOpKind.ADD, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
|
|
254
|
+
)
|
|
255
|
+
else:
|
|
256
|
+
# New API: infers result type automatically
|
|
257
|
+
return nvvm.atomicrmw(
|
|
258
|
+
op=nvvm.AtomicOpKind.ADD, ptr=gmem_ptr.llvm_ptr, a=Int32(a).ir_value()
|
|
259
|
+
)
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: quack-kernels
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Requires-Python: >=3.10
|
|
5
5
|
License-File: LICENSE
|
|
6
|
-
Requires-Dist: nvidia-cutlass-dsl>=4.4.0.
|
|
6
|
+
Requires-Dist: nvidia-cutlass-dsl>=4.4.0.dev1
|
|
7
7
|
Requires-Dist: torch
|
|
8
8
|
Requires-Dist: apache-tvm-ffi<0.2,>=0.1.6
|
|
9
9
|
Requires-Dist: torch-c-dlpack-ext
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
quack/__init__.py,sha256=GrpcEMDzBp43ePLzVo6TroIs5GcJv2dbypfFYpLGHmE,427
|
|
2
|
+
quack/activation.py,sha256=T_ypcXoz6a4wPPNZW2gKZuEj8JeucaKtKxQiQl5XrXc,21243
|
|
3
|
+
quack/autotuner.py,sha256=atw0ntedi22RPwSdjWOoge4S56S8VFvRocJQcYhpAlo,13454
|
|
4
|
+
quack/broadcast_utils.py,sha256=TwDdyR9BeIdTgCNJKSGIgN9N_mkQoCwtEnCmocPWSbw,1286
|
|
5
|
+
quack/compile_utils.py,sha256=qJ3oTsDlbAiddrJHtEO7LPYVqn_s-neNfiw-_KvfXZU,591
|
|
6
|
+
quack/copy_utils.py,sha256=GuO2DMXEzy0e3irfT51yIvUt4klWhhmlttgIJCC4tTY,23518
|
|
7
|
+
quack/cross_entropy.py,sha256=w6fjHC_vXt5ji2KfoLrSOdAvpLrQszrYU9rmRij2yY8,24899
|
|
8
|
+
quack/cute_dsl_ptxas.py,sha256=IfBnTJ9amdfDOQkuSdWCLTh7CkZziIvs_xrAc8taxhk,5122
|
|
9
|
+
quack/cute_dsl_utils.py,sha256=4uQx5aYDG9UvVzbWwJTjjJLrnoympz70_CD8b37FQWo,3854
|
|
10
|
+
quack/fast_math.py,sha256=4vrW_HB65rBlCyg0d6h6Y8Q4Gvjti5g74yqCv7acsxA,1020
|
|
11
|
+
quack/gemm.py,sha256=8V23MPq49QbV3csv-_AxjfE9qf8R3NIqFK9Q9db6t2c,7417
|
|
12
|
+
quack/gemm_act.py,sha256=_2NrcI2y0GM1MKwHjRc-6BZq8kthNXqSGxww1ci5Xac,31951
|
|
13
|
+
quack/gemm_config.py,sha256=94o3g9x7H0wi7aBbsb7H67H8nSzTurwL2zgvKDtQUas,3575
|
|
14
|
+
quack/gemm_dact.py,sha256=u2sLwNmaFZH8cetpwS-N0jQSSHpzc2-MNj01FhMpwR0,30116
|
|
15
|
+
quack/gemm_default_epi.py,sha256=1GdsILB1rrP7hrQDexeAn21R2qd_pBrdiDOOBi4r8Yk,11928
|
|
16
|
+
quack/gemm_interface.py,sha256=eoBtqqT7TQFb2gIQS8jTe7w1uFhXAMXqEcxU1RhAgng,60275
|
|
17
|
+
quack/gemm_sm100.py,sha256=lcFK5yKTvwc4RS28PbBHZBQeYCBIw9pi6s1SDZrifqs,120751
|
|
18
|
+
quack/gemm_sm90.py,sha256=ppru1dyrC4Ph0tGO-G_bbIBGqH-zxZ9r-IJqqQ0PTnM,88687
|
|
19
|
+
quack/gemm_symmetric.py,sha256=ljZtEuc_q3aABHlq4ukEM0vxBzkMqWVv2cIHPm1N17A,13161
|
|
20
|
+
quack/gemm_wrapper_utils.py,sha256=EaPyR3Lq19z_RkdB2_xxRj0IPSJMgyfpkrTXyvY3B6M,12775
|
|
21
|
+
quack/layout_utils.py,sha256=q2YkyAHcvT_DsEPc-UnJjGoSRWRxXEhYyDCT9uvz928,12187
|
|
22
|
+
quack/linear.py,sha256=xw2aGwIAAZUictUFHwRgefaT_9-mFfHZmYjFDL5_iMY,11311
|
|
23
|
+
quack/linear_cross_entropy.py,sha256=Zhy_gdMsKHOie-jntBaqIuiDJtkiq6qEBwnyuWwIRw4,10092
|
|
24
|
+
quack/mlp.py,sha256=YjdwQRwEePA9KyidFXp5H1-lxiJc8dZ41vl8Fv8pgss,2259
|
|
25
|
+
quack/pipeline.py,sha256=wQWT7h20qzQNlPj5Aub2uuQwzhDFSDuJzMFQawWdTHg,10504
|
|
26
|
+
quack/reduce.py,sha256=Yx-aB1gQOrKR2t3-joM5PCU0mcrPJLC6Jgs0FEZ6b3Q,12704
|
|
27
|
+
quack/reduction_base.py,sha256=QqlPs5L2VCxwDrO4CHPq-KY6f_BAYRbvsR6k81LPzTU,3180
|
|
28
|
+
quack/rmsnorm.py,sha256=TmDuLOSADfbTbHIRdF9Log7nCrEBGTWw6AiI2v12MFU,44580
|
|
29
|
+
quack/sm100_utils.py,sha256=-p5qj3Wi9n4WDLy2sl-fApYpGp5rH3JvZQb712OTxPs,1901
|
|
30
|
+
quack/sm90_utils.py,sha256=y1GPyzIm9Gxxobr9bNaKuH5LwiIJn5q9Q51P30iHi1w,5576
|
|
31
|
+
quack/softmax.py,sha256=ZqeVbnGfzwkro1LfWBHagbS7B7ug7b9SLZWuGx_Y3Kc,14367
|
|
32
|
+
quack/tensormap_manager.py,sha256=Ts3Mxp0_es2RNA0ffvUjWMXN79lsfWEBZ0DQYhtbcnw,5338
|
|
33
|
+
quack/tile_scheduler.py,sha256=H9Mhyup-8b-S4u3cS1NWIZrBpETporaif9U7ibA2-_A,44343
|
|
34
|
+
quack/topk.py,sha256=XnKvJD7sWdPg_1etExVPQif7I29EcY1fjqcY2QM9BWc,22534
|
|
35
|
+
quack/utils.py,sha256=DeABbD_2H63MGegcJFHR5qOiML2ujVh-PAo7qV3eZ6s,8731
|
|
36
|
+
quack/varlen_utils.py,sha256=SOYkomxX2FoqjYlybg99CqNhS9IARM6F9ba2AkIVvT4,15811
|
|
37
|
+
quack/sort/bitonic_sort.py,sha256=C4i9VfZOBGJbHNCOoXOwrHkCgLBr3c64Ueymc64d31U,4840
|
|
38
|
+
quack/sort/generate_sorting_networks.py,sha256=vkJBOjTVEinQkWT4OtFqOWxFVdTIPoNAQocneKc9-rM,14477
|
|
39
|
+
quack/sort/sorting_networks.py,sha256=l_26zi3gXD_z-tnm2eAczRrmE-mbaz00KmqH6ONivL8,9686
|
|
40
|
+
quack/sort/utils.py,sha256=RbubEY1GcEpsjiz_6o5o2WB47IeMOzaajW6Jis0s444,1059
|
|
41
|
+
quack_kernels-0.2.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
42
|
+
quack_kernels-0.2.6.dist-info/METADATA,sha256=OHPfmEcfEGXK_0e10oPBZ-ZCcG8-hdfY58K_OIrTFds,366
|
|
43
|
+
quack_kernels-0.2.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
44
|
+
quack_kernels-0.2.6.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
|
|
45
|
+
quack_kernels-0.2.6.dist-info/RECORD,,
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
quack/__init__.py,sha256=0MnyCHBHxG4k12KHNzd-JlElf1M0qRrzhs89PJZJUHA,427
|
|
2
|
-
quack/activation.py,sha256=-lZgojraqdyLjOzgOXBehoVeRBhBq30UX7kOkXsCpGI,20855
|
|
3
|
-
quack/autotuner.py,sha256=atw0ntedi22RPwSdjWOoge4S56S8VFvRocJQcYhpAlo,13454
|
|
4
|
-
quack/broadcast_utils.py,sha256=X5vWg2RtIIWU9Z7nEUW6m0EP0Cfd9XtCKxp4tSyp4Mg,1283
|
|
5
|
-
quack/compile_utils.py,sha256=qJ3oTsDlbAiddrJHtEO7LPYVqn_s-neNfiw-_KvfXZU,591
|
|
6
|
-
quack/copy_utils.py,sha256=IIXtLJv0wQSKfinjIJwG10xQScRvAZvKw1yBV2MXckw,23682
|
|
7
|
-
quack/cross_entropy.py,sha256=w6fjHC_vXt5ji2KfoLrSOdAvpLrQszrYU9rmRij2yY8,24899
|
|
8
|
-
quack/cute_dsl_ptxas.py,sha256=IfBnTJ9amdfDOQkuSdWCLTh7CkZziIvs_xrAc8taxhk,5122
|
|
9
|
-
quack/cute_dsl_utils.py,sha256=4uQx5aYDG9UvVzbWwJTjjJLrnoympz70_CD8b37FQWo,3854
|
|
10
|
-
quack/fast_math.py,sha256=E1XUqfUt0_n9BPZNggF-UDzZ6anso9bYUrwqafemWvQ,2297
|
|
11
|
-
quack/gemm.py,sha256=8V23MPq49QbV3csv-_AxjfE9qf8R3NIqFK9Q9db6t2c,7417
|
|
12
|
-
quack/gemm_act.py,sha256=Y8HJKfw3tCoFKecwhwhd5xpXd9jCQCGZT_V2xXf-CnU,20823
|
|
13
|
-
quack/gemm_config.py,sha256=94o3g9x7H0wi7aBbsb7H67H8nSzTurwL2zgvKDtQUas,3575
|
|
14
|
-
quack/gemm_dact.py,sha256=l__UhCrFbPjD9a1TAVgP7_C7p5lLfX5DkRcM6z0ofOw,7789
|
|
15
|
-
quack/gemm_default_epi.py,sha256=6qO8Ovtcw8sQQ_kXTBTTQ5IHh1lS6RBCGZG0lgLHNrs,11916
|
|
16
|
-
quack/gemm_interface.py,sha256=AF5PYTNgEHjb3MNXcNvvEpOcShAHtak0Xu12l1zrOAw,44804
|
|
17
|
-
quack/gemm_sm100.py,sha256=U9jmzpST_d1W6CBFf1ZHhTtr0K8hENCsUz7dXvHaMZc,122344
|
|
18
|
-
quack/gemm_sm90.py,sha256=u-Q3fN6DPm1fEdz0LcMecMbGTBcRunUCWopufwO8cHU,92015
|
|
19
|
-
quack/gemm_symmetric.py,sha256=mqx7wgOCY6Dh9hjL6gR9PBstMD476GhpA_NkGeaEtik,13349
|
|
20
|
-
quack/gemm_wrapper_utils.py,sha256=EaPyR3Lq19z_RkdB2_xxRj0IPSJMgyfpkrTXyvY3B6M,12775
|
|
21
|
-
quack/layout_utils.py,sha256=qar8x_6VPKOdrz_lAGH_c4W_HKfYLk3Lhtd3Rv1OBBE,12197
|
|
22
|
-
quack/linear.py,sha256=mhN2A98w7H7X4MS63XCCK3gpOm1eS8H7a4WO9ovkt5U,9791
|
|
23
|
-
quack/linear_cross_entropy.py,sha256=Zhy_gdMsKHOie-jntBaqIuiDJtkiq6qEBwnyuWwIRw4,10092
|
|
24
|
-
quack/mlp.py,sha256=YjdwQRwEePA9KyidFXp5H1-lxiJc8dZ41vl8Fv8pgss,2259
|
|
25
|
-
quack/pipeline.py,sha256=3d4D8CPHw7ytZfdH9HFkfDng12YTnGf3pAe2DYxHjK4,11993
|
|
26
|
-
quack/reduce.py,sha256=ySKT2xh1_pIlbJX29BPmwH6yJ7MxIrRZyxHIPPYVpm0,12698
|
|
27
|
-
quack/reduction_base.py,sha256=QqlPs5L2VCxwDrO4CHPq-KY6f_BAYRbvsR6k81LPzTU,3180
|
|
28
|
-
quack/rmsnorm.py,sha256=esy18s5JtT7KBPRPhWf_anLRTrtromwqeJmg2yzOm60,44678
|
|
29
|
-
quack/sm100_utils.py,sha256=-p5qj3Wi9n4WDLy2sl-fApYpGp5rH3JvZQb712OTxPs,1901
|
|
30
|
-
quack/sm90_utils.py,sha256=RLfIZFPhx7Mb9gXwilJ-QSULaj_Q4unaQJA2tFjGIJ4,5545
|
|
31
|
-
quack/softmax.py,sha256=ZqeVbnGfzwkro1LfWBHagbS7B7ug7b9SLZWuGx_Y3Kc,14367
|
|
32
|
-
quack/tensormap_manager.py,sha256=Ts3Mxp0_es2RNA0ffvUjWMXN79lsfWEBZ0DQYhtbcnw,5338
|
|
33
|
-
quack/tile_scheduler.py,sha256=vbKq0xp94eII0uJ63yY_3sgvJkQI7Irc8y1OttO6cRA,42514
|
|
34
|
-
quack/topk.py,sha256=43xHpRGbwZCSRsulmfrG4WA_r2eLHc3sniaUFU7wn-o,22522
|
|
35
|
-
quack/utils.py,sha256=WIttE1iiwyPIwR1NpaeO26Pn9YkZb361TDxFTUDH-IE,7354
|
|
36
|
-
quack/varlen_utils.py,sha256=SOYkomxX2FoqjYlybg99CqNhS9IARM6F9ba2AkIVvT4,15811
|
|
37
|
-
quack/sort/bitonic_sort.py,sha256=-4VmHGmnqRLaVF-IrNhbJqNEJcz-FJT5GuzSWTFeIfI,4831
|
|
38
|
-
quack/sort/generate_sorting_networks.py,sha256=vkJBOjTVEinQkWT4OtFqOWxFVdTIPoNAQocneKc9-rM,14477
|
|
39
|
-
quack/sort/sorting_networks.py,sha256=l_26zi3gXD_z-tnm2eAczRrmE-mbaz00KmqH6ONivL8,9686
|
|
40
|
-
quack/sort/utils.py,sha256=RbubEY1GcEpsjiz_6o5o2WB47IeMOzaajW6Jis0s444,1059
|
|
41
|
-
quack_kernels-0.2.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
42
|
-
quack_kernels-0.2.5.dist-info/METADATA,sha256=5FnKfn7JrhBVjFUILnccs-OL2I8UN6Lqo7QR0i4tAlA,366
|
|
43
|
-
quack_kernels-0.2.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
44
|
-
quack_kernels-0.2.5.dist-info/top_level.txt,sha256=6e4Jr_vNJbZTYwlO_Ahf_sDeHDE0zcqcf7Le11FKxxo,6
|
|
45
|
-
quack_kernels-0.2.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|