quack-kernels 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,275 @@
+# Copyright (c) 2025, Tri Dao
+from typing import Optional, Literal
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.amp import custom_fwd, custom_bwd
+
+from quack.cross_entropy import cross_entropy, cross_entropy_fwd_out
+from quack.gemm_interface import gemm, gemm_add, gemm_add_inplace
+from quack.linear import linear_fwd_convert_type
+
+
+def linear_cross_entropy_func(
+    x: Tensor,  # (..., d)
+    weight: Tensor,  # (V, d)
+    bias: Optional[Tensor],  # (V,) or None
+    target: Tensor,  # (...,), int or long
+    ignore_index: int = -100,
+    reduction: Literal["none", "mean", "sum"] = "mean",
+    inplace_backward: bool = False,
+) -> Tensor:
+    y = F.linear(x, weight, bias)  # (..., V)
+    return cross_entropy(
+        y, target, ignore_index=ignore_index, reduction=reduction, inplace_backward=inplace_backward
+    )
+
+
+def linear_cross_entropy_func_ref(
+    x: Tensor,  # (..., d)
+    weight: Tensor,  # (V, d)
+    bias: Optional[Tensor],  # (V,) or None
+    target: Tensor,  # (...,), int or long
+    ignore_index: int = -100,
+    reduction: Literal["none", "mean", "sum"] = "mean",
+) -> Tensor:
+    y = F.linear(x, weight, bias)  # (..., V)
+    return F.cross_entropy(y, target, ignore_index=ignore_index, reduction=reduction)
+
+
+def chunked_linear_cross_entropy_fwd(
+    x: Tensor,  # (B*L, d) where B is batch, L is seqlen
+    weight: Tensor,  # (V, d) where V is vocab size
+    target: Tensor,  # (B*L,)
+    chunk_size: int = 4096,
+    ignore_index: int = -100,
+    tuned: bool = True,
+) -> tuple[Tensor, Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+    """
+    Chunked forward pass for linear cross entropy.
+
+    Splits input along batch dimension, computes matmul and cross_entropy_fwd
+    for each chunk, stores dx for each chunk, and accumulates dw.
+
+    Returns:
+        loss: (B*L,) loss values
+        dx: (B*L, d) gradient w.r.t. input
+        dw: (V, d) gradient w.r.t. weight (accumulated across chunks except last)
+        last_dlogits_chunk: (chunk_len, V) gradient of last chunk's logits (for deferred dw computation)
+        last_x_chunk: (chunk_len, d) last chunk's input (for deferred dw computation)
+    """
+    B_L, d = x.shape
+    V, _ = weight.shape
+    device = x.device
+    num_chunks = (B_L + chunk_size - 1) // chunk_size
+    # Since we use gemm with TMA we require some alignment
+    assert chunk_size % 8 == 0, "chunk_size must be multiple of 8"
+    assert B_L % 8 == 0
+    # Pre-allocate outputs
+    loss = torch.empty(B_L, device=device, dtype=torch.float32)
+    logits_chunk_preallocated = torch.empty((chunk_size, V), device=device, dtype=x.dtype)
+    dx = torch.empty_like(x)
+    # Last chunk of dw will be deferred to the backward pass
+    dw = torch.empty_like(weight, dtype=torch.float32) if num_chunks > 1 else None
+    last_dlogits_chunk = None
+    last_x_chunk = None
+
+    # Process in chunks
+    for i, (x_chunk, target_chunk, loss_chunk, dx_chunk) in enumerate(
+        zip(*(t.split(chunk_size) for t in (x, target, loss, dx)))
+    ):
+        chunk_len = x_chunk.shape[0]
+        logits_chunk = logits_chunk_preallocated[:chunk_len]  # (chunk_len, V)
+        torch.mm(x_chunk, weight.mT, out=logits_chunk)
+        # Compute cross entropy forward with gradients
+        dlogits_chunk = logits_chunk  # inplace_backward
+        cross_entropy_fwd_out(
+            logits_chunk,
+            target_chunk,
+            None,  # target_logit
+            loss=loss_chunk,
+            lse=None,  # we don't need lse here
+            dx=dlogits_chunk,
+            ignore_index=ignore_index,
+        )
+        # Compute dx for this chunk: dlogits @ weight
+        torch.mm(dlogits_chunk, weight, out=dx_chunk)  # (chunk_len, d)
+        # Compute dw for all chunks except the last
+        if i == num_chunks - 1:
+            # Last chunk: save for backward pass
+            last_dlogits_chunk = dlogits_chunk
+            last_x_chunk = x_chunk
+        elif i == 0:
+            # First chunk: dw = dlogits.T @ x_chunk
+            gemm(dlogits_chunk.T, x_chunk, out=dw, tuned=tuned)
+        else:
+            # Middle chunks: dw += dlogits.T @ x_chunk
+            gemm_add_inplace(dlogits_chunk.T, x_chunk, dw, tuned=tuned)
+    return loss, dx, dw, last_dlogits_chunk, last_x_chunk
+
+
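
The dw bookkeeping above is a chunked outer-product reduction in which the final chunk's term is held back so the backward pass can fold it in together with the upstream gradient scaling. A minimal pure-PyTorch illustration of that identity, with toy shapes and no quack kernels involved (all names below are illustrative only):

import torch

B_L, d, V, chunk = 16, 8, 32, 4
x = torch.randn(B_L, d)
dlogits = torch.randn(B_L, V)  # stand-in for the per-chunk softmax gradients

x_chunks, dl_chunks = x.split(chunk), dlogits.split(chunk)
# Accumulate every chunk except the last, exactly like the loop above.
dw_partial = sum(dl.T @ xc for dl, xc in zip(dl_chunks[:-1], x_chunks[:-1]))
# The deferred last term is added later (and scaled by dloss) in the backward pass.
dw_full = dw_partial + dl_chunks[-1].T @ x_chunks[-1]
assert torch.allclose(dw_full, dlogits.T @ x, atol=1e-5)
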
+class ChunkedLinearCrossEntropyFunction(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(device_type="cuda")
+    def forward(
+        ctx,
+        x: Tensor,
+        weight: Tensor,
+        target: Tensor,
+        ignore_index: int = -100,
+        reduction: Literal["mean", "sum"] = "mean",
+        chunk_size: int = 4096,
+        tuned: bool = True,
+    ):
+        """
+        Forward pass computes loss and stores dx and dw for backward.
+        """
+        ctx.weight_dtype = weight.dtype
+        x, weight = linear_fwd_convert_type(x, weight)
+        batch_shape = x.shape[:-1]
+        x = x.reshape(-1, x.shape[-1])
+        # TODO: don't need to compute bwd if neither x nor weight requires grad, or not training
+        loss, dx, dw, last_dlogits_chunk, last_x_chunk = chunked_linear_cross_entropy_fwd(
+            x, weight, target, chunk_size, ignore_index, tuned=tuned
+        )
+        loss_sum = loss.sum()
+        loss_scale = None if reduction == "sum" else 1.0 / (target != ignore_index).sum().float()
+        ctx.save_for_backward(dx, dw, last_dlogits_chunk, last_x_chunk, loss_scale)
+        ctx.batch_shape = batch_shape
+        ctx.ignore_index = ignore_index
+        ctx.reduction = reduction
+        ctx.tuned = tuned
+        return loss_sum if loss_scale is None else loss_sum * loss_scale
+
+    @staticmethod
+    @custom_bwd(device_type="cuda")
+    def backward(ctx, dloss):
+        """
+        Backward pass scales pre-computed gradients by dloss and completes
+        the last chunk's dw computation.
+        dloss is a scalar.
+        """
+        dx, dw, last_dlogits_chunk, last_x_chunk, loss_scale = ctx.saved_tensors
+        tuned = ctx.tuned
+        if loss_scale is not None:
+            dloss = dloss * loss_scale
+        # TODO: the case where x or weight doesn't require grad
+        dx.mul_(dloss)
+        dx = dx.reshape(*ctx.batch_shape, dx.shape[-1])
+        # Complete dw computation: dw = dloss * dw + dloss * (last_dlogits_chunk.T @ last_x_chunk)
+        if dw is None:
+            # Only had one chunk, compute dw directly with dloss scaling
+            dw = gemm(
+                last_dlogits_chunk.T,
+                last_x_chunk,
+                out_dtype=ctx.weight_dtype,
+                alpha=dloss,
+                tuned=tuned,
+            )
+        else:
+            # Add last chunk's contribution with dloss scaling
+            # dw = dloss * dw + dloss * (last_dlogits_chunk.T @ last_x_chunk)
+            # We use alpha=dloss, beta=dloss
+            if ctx.weight_dtype == dw.dtype:
+                gemm_add_inplace(
+                    last_dlogits_chunk.T, last_x_chunk, dw, alpha=dloss, beta=dloss, tuned=tuned
+                )
+            else:
+                dw = gemm_add(
+                    last_dlogits_chunk.T,
+                    last_x_chunk,
+                    dw,
+                    alpha=dloss,
+                    beta=dloss,
+                    out_dtype=ctx.weight_dtype,
+                    tuned=tuned,
+                )
+        return dx, dw, None, None, None, None, None
+
+
+def chunked_linear_cross_entropy(
+    x: Tensor,
+    weight: Tensor,
+    target: Tensor,
+    chunk_size: int = 4096,
+    ignore_index: int = -100,
+    reduction: Literal["mean", "sum"] = "mean",
+    tuned: bool = True,
+) -> Tensor:
+    """
+    Chunked linear cross entropy with automatic differentiation support.
+
+    Args:
+        x: Input tensor of shape (B*L, d)
+        weight: Weight tensor of shape (V, d)
+        target: Target indices of shape (B*L,)
+        chunk_size: Size of chunks to process
+        ignore_index: Index to ignore in loss computation
+        reduction: Type of reduction to apply
+        tuned: Whether to use tuned kernels
+
+    Returns:
+        Loss tensor with specified reduction
+    """
+    if reduction not in ["mean", "sum"]:
+        raise ValueError(f"Invalid reduction: {reduction}")
+    loss = ChunkedLinearCrossEntropyFunction.apply(
+        x, weight, target, ignore_index, reduction, chunk_size, tuned
+    )
+    return loss
+
+
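
Because the chunked autograd path precomputes dx and most of dw during the forward, a quick sanity check is to compare it against the eager reference defined at the top of the file. A minimal sketch, assuming a CUDA device with the quack GEMM kernels available and that the functions above are in scope (the new file's import path is not shown in this diff); sizes and tolerances are illustrative:

import torch

x = torch.randn(8192, 512, device="cuda", dtype=torch.bfloat16, requires_grad=True)
w = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16, requires_grad=True)
t = torch.randint(0, 1024, (8192,), device="cuda")

# Chunked path: loss is reduced with "mean" by default.
chunked_linear_cross_entropy(x, w, t, chunk_size=4096).backward()

# Eager reference on detached copies of the same leaves.
x_ref = x.detach().clone().requires_grad_()
w_ref = w.detach().clone().requires_grad_()
linear_cross_entropy_func_ref(x_ref, w_ref, None, t).backward()

# Expect close agreement up to bf16 accumulation differences.
print((x.grad - x_ref.grad).abs().max(), (w.grad - w_ref.grad).abs().max())
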
+class LinearCrossEntropy(nn.Linear):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = False,
+        ignore_index: int = -100,
+        reduction: Literal["none", "mean", "sum"] = "mean",
+        chunk_size: Optional[int] = None,
+        inplace_backward: bool = False,
+        tuned: bool = True,
+        device=None,
+        dtype=None,
+    ) -> None:
+        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
+        self.ignore_index = ignore_index
+        self.reduction = reduction
+        self.chunk_size = chunk_size
+        self.inplace_backward = inplace_backward
+        self.tuned = tuned
+
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
+        if (
+            self.bias is None
+            and input.is_cuda
+            and input.stride(-1) == 1
+            and self.in_features % 8 == 0
+            and self.out_features % 8 == 0
+            and input.shape[:-1].numel() % 8 == 0
+            and self.chunk_size is not None
+            and self.chunk_size % 8 == 0
+            and self.reduction in ["mean", "sum"]
+        ):
+            return chunked_linear_cross_entropy(
+                input,
+                self.weight,
+                target,
+                chunk_size=self.chunk_size,
+                ignore_index=self.ignore_index,
+                reduction=self.reduction,
+                tuned=self.tuned,
+            )
+        else:
+            return linear_cross_entropy_func(
+                input,
+                self.weight,
+                self.bias,
+                target,
+                ignore_index=self.ignore_index,
+                reduction=self.reduction,
+                inplace_backward=self.inplace_backward,
+            )
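
The LinearCrossEntropy module above fuses the output projection with the loss so that the full (tokens, vocab) logits tensor is never materialized at once on the chunked path. A minimal usage sketch; the import path is assumed here since the diff does not show the new file's name, and shapes, dtype, and device are illustrative:

import torch
from quack.linear_cross_entropy import LinearCrossEntropy  # assumed module path

lm_head = LinearCrossEntropy(
    in_features=4096, out_features=32768, chunk_size=4096,
    device="cuda", dtype=torch.bfloat16,
)
hidden = torch.randn(8 * 512, 4096, device="cuda", dtype=torch.bfloat16, requires_grad=True)
targets = torch.randint(0, 32768, (8 * 512,), device="cuda")
loss = lm_head(hidden, targets)  # scalar; chunked path is taken since bias is None and dims are multiples of 8
loss.backward()                  # reuses the dx/dw partials computed during the forward
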
quack/mlp.py CHANGED
@@ -3,131 +3,31 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
-from torch.amp import custom_fwd, custom_bwd
 
-from einops import rearrange
+from quack.linear import linear_act_func, act_linear_func
 
-from gemm_cublas import gemm as gemm_cb, gemm_add_ as gemm_add_cb_
-# from gemm_cublas.interface import gemm_tuned as gemm_cb, gemm_add_tuned_ as gemm_add_cb_
 
-from quack import gemm, gemm_swiglu, gemm_dswiglu  # TODO: implement these
+def mlp_func(x, weight1, weight2, activation: str, fuse_grad_accum=False, tuned=True):
+    preact, postact = linear_act_func(
+        x,
+        weight1,
+        activation,
+        store_preact=torch.is_grad_enabled(),
+        fuse_grad_accum=fuse_grad_accum,
+        tuned=tuned,
+    )
+    out = act_linear_func(
+        preact,
+        weight2,
+        postact,
+        activation=activation,
+        fuse_grad_accum=fuse_grad_accum,
+        tuned=tuned,
+    )
+    return out
 
 
-class MLPSwiGLUFunc(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd(device_type="cuda")
-    def forward(ctx, x, weight1, weight2, fuse_grad_accum=False):
-        """
-        x: (..., in_features)
-        weight1: (2 * intermediate_features, in_features)
-        weight2: (out_features, intermediate_features)
-        out: (..., out_features)
-        Note that we do swiglu on the even and odd indices of the intermediate output,
-        i.e. silu(y[..., ::2]) * y[..., 1::2].
-        This is different from the usual swiglu implementation that does: y1, y2 = y.chunk(2, dim=-1); silu(y1) * y2
-        """
-        needs_weight1_grad = weight1.requires_grad
-        needs_weight2_grad = weight2.requires_grad
-        needs_input_grad = x.requires_grad
-        ctx.weight1_dtype = weight1.dtype
-        ctx.weight2_dtype = weight2.dtype
-        autocast_dtype = torch.get_autocast_dtype("cuda")
-        if torch.is_autocast_enabled():
-            x = x.to(dtype=autocast_dtype)
-        weight1_og = weight1
-        weight2_og = weight2
-        if torch.is_autocast_enabled():
-            weight1 = weight1.to(dtype=autocast_dtype)
-            weight2 = weight2.to(dtype=autocast_dtype)
-        batch_shape = x.shape[:-1]
-        x = x.reshape(-1, x.shape[-1])
-        # don't need preact if not computing gradient
-        store_preact = needs_input_grad or needs_weight1_grad or needs_weight2_grad
-        # (batch, inter_dim) & (batch, 2 * inter_dim)
-        y, preact = gemm_swiglu(x, weight1.T, store_preact=store_preact)
-        # out = F.linear(y, weight2)
-        out = gemm(y, weight2.T)
-        if not needs_input_grad:
-            weight1, weight1_og = None, None
-        if not needs_weight1_grad:
-            x = None
-        if not needs_input_grad and not needs_weight1_grad and not needs_weight2_grad:
-            weight2, weight2_og = None, None
-            preact = None
-        ctx.save_for_backward(
-            x,
-            preact,
-            weight1,
-            weight2,
-            *((weight1_og, weight2_og) if fuse_grad_accum else (None, None)),
-        )
-        ctx.fuse_grad_accum = fuse_grad_accum
-        return out.reshape(*batch_shape, out.shape[-1])
-
-    @staticmethod
-    @custom_bwd(device_type="cuda")
-    def backward(ctx, dout):
-        """
-        dout: (..., out_features)
-        """
-        if not torch.compiler.is_dynamo_compiling():
-            assert dout.stride(-1) == 1
-        # weight1_og and weight2_og are None if not ctx.fused_grad_accum
-        x, preact, weight1, weight2, weight1_og, weight2_og = ctx.saved_tensors
-        batch_shape = dout.shape[:-1]
-        dout = dout.reshape(-1, dout.shape[-1])
-        if (
-            not ctx.needs_input_grad[0]
-            and not ctx.needs_weight1_grad[0]
-            and not ctx.needs_weight2_grad[0]
-        ):
-            return (None,) * 4
-        assert preact is not None
-        # (batch, 2 * inter_dim) and (batch, inter_dim)
-        # dpreact, y = gemm_dswiglu(dout, weight2, preact)
-        dpreact, y = gemm_dswiglu(dout, weight2, preact, sm_carveout=16)
-        if ctx.needs_input_grad[2]:
-            # fuse_grad_accum is not compatible with torch.compile
-            if not ctx.fuse_grad_accum or weight2_og.grad is None or torch.compiler.is_compiling():
-                dweight2 = gemm_cb(dout.T, y, out_dtype=ctx.weight2_dtype)
-                # dweight2 = gemm_cb(dout.T, y, out_dtype=ctx.weight2_dtype, sm_carveout=16)
-            else:
-                # print("Using fuse grad accum in MLP 2", dout.shape, y.shape, weight2_og.grad.shape)
-                gemm_add_cb_(dout.T, y, weight2_og.grad)
-                # gemm_add_cb_(dout.T, y, weight2_og.grad, sm_carveout=16)
-                dweight2 = weight2_og.grad
-                weight2_og.grad = (
-                    None  # So that pytorch doesn't add dweight to weight2_og.grad again
-                )
-        else:
-            dweight2 = None
-        if ctx.needs_input_grad[0]:
-            dx = dpreact @ weight1  # (batch, in_features)
-            # dx = gemm(dpreact, weight1)  # (batch, in_features)
-            dx = dx.reshape(*batch_shape, dx.shape[-1])
-        else:
-            dx = None
-        if ctx.needs_input_grad[1]:
-            # fuse_grad_accum is not compatible with torch.compile
-            if not ctx.fuse_grad_accum or weight1_og.grad is None or torch.compiler.is_compiling():
-                dweight1 = gemm_cb(dpreact.T, x, out_dtype=ctx.weight1_dtype)
-            else:
-                # print("Using fuse grad accum in MLP 1", dpreact.shape, x.shape, weight1_og.grad.shape)
-                gemm_add_cb_(dpreact.T, x, weight1_og.grad)
-                dweight1 = weight1_og.grad
-                weight1_og.grad = (
-                    None  # So that pytorch doesn't add dweight to weight1_og.grad again
-                )
-        else:
-            dweight1 = None
-        return dx, dweight1, dweight2, None
-
-
-def mlp_swiglu_func(x, weight1, weight2, fuse_grad_accum=False):
-    return MLPSwiGLUFunc.apply(x, weight1, weight2, fuse_grad_accum)
-
-
-class MLPSwiGLU(nn.Module):
+class MLP(nn.Module):
     def __init__(
         self,
         in_features,
@@ -135,25 +35,21 @@ class MLPSwiGLU(nn.Module):
         out_features=None,
         bias1=False,
         bias2=False,
-        multiple_of=128,
+        activation="gelu",
         device=None,
         dtype=None,
         fuse_grad_accum: bool = False,
+        tuned: bool = True,
     ):
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
        out_features = out_features if out_features is not None else in_features
-        hidden_features = (
-            hidden_features if hidden_features is not None else int(8 * in_features / 3)
-        )
-        hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
-        self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
-        self.fc1.weight._muon_reshape_functions = (
-            lambda w: rearrange(w, "(d two) e -> two d e", two=2),
-            lambda w: rearrange(w, "two d e -> (d two) e"),
-        )
+        hidden_features = hidden_features if hidden_features is not None else 4 * in_features
+        self.activation = activation
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
         self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
         self.fuse_grad_accum = fuse_grad_accum
+        self.tuned = tuned
 
     def forward(self, input: Tensor) -> Tensor:
         if (
@@ -162,43 +58,17 @@ class MLPSwiGLU(nn.Module):
             and input.is_cuda
             and input.stride(-1) == 1
             and self.fc1.in_features % 8 == 0
-            and self.fc1.out_features % 16 == 0
+            and self.fc1.out_features % 8 == 0
             and self.fc2.out_features % 8 == 0
         ):
-            return mlp_swiglu_func(
+            return mlp_func(
                 input,
                 self.fc1.weight,
                 self.fc2.weight,
+                activation=self.activation,
                 fuse_grad_accum=self.fuse_grad_accum,
+                tuned=self.tuned,
             )
         else:
             y = self.fc1(input)
             return self.fc2(F.silu(y[..., ::2]) * y[..., 1::2])
-
-
-class MLPSwiGLURef(nn.Module):
-    def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-        bias1=False,
-        bias2=False,
-        multiple_of=128,
-        device=None,
-        dtype=None,
-    ):
-        factory_kwargs = {"device": device, "dtype": dtype}
-        super().__init__()
-        out_features = out_features if out_features is not None else in_features
-        hidden_features = (
-            hidden_features if hidden_features is not None else int(8 * in_features / 3)
-        )
-        hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
-        self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
-        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
-
-    def forward(self, input: Tensor) -> Tensor:
-        y = self.fc1(input)
-        y1, y2 = y.chunk(2, dim=-1)
-        return self.fc2(F.silu(y1) * y2)
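
For comparison, a short sketch of how the reworked MLP module might be used (sizes are illustrative; assumes a CUDA device and a contiguous last dimension so the fused linear_act_func / act_linear_func path is taken):

import torch
from quack.mlp import MLP

mlp = MLP(in_features=1024, hidden_features=4096, activation="gelu",
          device="cuda", dtype=torch.bfloat16)
x = torch.randn(2, 128, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
y = mlp(x)           # fc1 -> activation -> fc2 via the fused quack kernels
y.sum().backward()   # gradients flow through linear_act_func / act_linear_func
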
quack/pipeline.py CHANGED
@@ -8,21 +8,6 @@ from cutlass.cutlass_dsl import Boolean, Int32, if_generate
 from cutlass.pipeline import CooperativeGroup, PipelineOp, pipeline_init_wait
 from cutlass.pipeline import PipelineAsync, PipelineTmaAsync, PipelineState, PipelineUserType
 
-from cutlass.cutlass_dsl import dsl_user_op
-from cutlass._mlir.dialects import nvvm
-
-
-@dsl_user_op
-def cp_async_mbarrier_arrive_shared(
-    mbar_ptr: cute.Pointer, noinc: bool = False, *, loc=None, ip=None
-) -> None:
-    nvvm.cp_async_mbarrier_arrive_shared(
-        mbar_ptr.llvm_ptr,
-        noinc=noinc,
-        loc=loc,
-        ip=ip,
-    )
-
 
 class PipelineStateWAdvance(PipelineState):
     def advance_iters(self, num_iterations: Int32):
@@ -65,7 +50,7 @@ def make_pipeline_state(type: PipelineUserType, stages: int):
 @dataclass(frozen=True)
 class PipelineTmaCpAsync(PipelineTmaAsync):
     """
-    PipelineTmaCpAsync is used for CpAync + TMA producers and AsyncThread consumers
+    PipelineTmaCpAsync is used for CpAsync + TMA producers and AsyncThread consumers
     """
 
     @staticmethod
@@ -163,4 +148,4 @@ class PipelineTmaCpAsync(PipelineTmaAsync):
         """
         We need the mbarrier to track the completion of cp.async
         """
-        cp_async_mbarrier_arrive_shared(self.producer_get_barrier(state), noinc=True)
+        cute.arch.cp_async_mbarrier_arrive_noinc(self.producer_get_barrier(state))