mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mslk/__init__.py +56 -0
- mslk/attention/__init__.py +7 -0
- mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
- mslk/attention/flash_attn/__init__.py +22 -0
- mslk/attention/flash_attn/ampere_helpers.py +104 -0
- mslk/attention/flash_attn/barrier.py +72 -0
- mslk/attention/flash_attn/benchmark.py +269 -0
- mslk/attention/flash_attn/blackwell_helpers.py +754 -0
- mslk/attention/flash_attn/block_info.py +109 -0
- mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
- mslk/attention/flash_attn/block_sparsity.py +219 -0
- mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
- mslk/attention/flash_attn/copy_utils.py +341 -0
- mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
- mslk/attention/flash_attn/fast_math.py +22 -0
- mslk/attention/flash_attn/flash_bwd.py +1262 -0
- mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
- mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
- mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
- mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
- mslk/attention/flash_attn/flash_fwd.py +2471 -0
- mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
- mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
- mslk/attention/flash_attn/hopper_helpers.py +102 -0
- mslk/attention/flash_attn/interface.py +1771 -0
- mslk/attention/flash_attn/mask.py +610 -0
- mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
- mslk/attention/flash_attn/named_barrier.py +32 -0
- mslk/attention/flash_attn/pack_gqa.py +165 -0
- mslk/attention/flash_attn/paged_kv.py +176 -0
- mslk/attention/flash_attn/pipeline.py +273 -0
- mslk/attention/flash_attn/seqlen_info.py +139 -0
- mslk/attention/flash_attn/softmax.py +583 -0
- mslk/attention/flash_attn/testing.py +424 -0
- mslk/attention/flash_attn/tile_scheduler.py +720 -0
- mslk/attention/flash_attn/utils.py +860 -0
- mslk/attention/fmha/__init__.py +967 -0
- mslk/attention/fmha/_triton/__init__.py +6 -0
- mslk/attention/fmha/_triton/available.py +50 -0
- mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
- mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
- mslk/attention/fmha/attn_bias.py +2186 -0
- mslk/attention/fmha/attn_bias_utils.py +536 -0
- mslk/attention/fmha/ck.py +508 -0
- mslk/attention/fmha/ck_decoder.py +141 -0
- mslk/attention/fmha/ck_splitk.py +204 -0
- mslk/attention/fmha/common.py +598 -0
- mslk/attention/fmha/cutlass.py +461 -0
- mslk/attention/fmha/cutlass_blackwell.py +560 -0
- mslk/attention/fmha/dispatch.py +224 -0
- mslk/attention/fmha/flash.py +862 -0
- mslk/attention/fmha/flash3.py +858 -0
- mslk/attention/fmha/flash_mtia.py +245 -0
- mslk/attention/fmha/merge_training.py +192 -0
- mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
- mslk/attention/fmha/torch_attention_compat.py +154 -0
- mslk/attention/fmha/tree_attention.py +718 -0
- mslk/attention/fmha/triton_splitk.py +1378 -0
- mslk/attention/fmha/unbind.py +130 -0
- mslk/attention/fmha/utils/__init__.py +6 -0
- mslk/attention/fmha/utils/bench.py +74 -0
- mslk/attention/fmha/utils/cpp_lib.py +148 -0
- mslk/attention/fmha/utils/op_common.py +65 -0
- mslk/attention/gqa_attn_splitk/__init__.py +11 -0
- mslk/bench/comm/__init__.py +7 -0
- mslk/bench/comm/comm_bench.py +255 -0
- mslk/bench/common/__init__.py +5 -0
- mslk/bench/common/utils.py +148 -0
- mslk/bench/conv/__init__.py +7 -0
- mslk/bench/conv/conv_bench.py +551 -0
- mslk/bench/conv/conv_ops.py +213 -0
- mslk/bench/gemm/__init__.py +7 -0
- mslk/bench/gemm/gemm_bench.py +859 -0
- mslk/bench/gemm/gemm_ops.py +3342 -0
- mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
- mslk/bench/moe/__init__.py +7 -0
- mslk/bench/moe/gather_scatter_bench.py +356 -0
- mslk/bench/quantize/quantize_bench.py +345 -0
- mslk/bench/quantize/quantize_ops.py +266 -0
- mslk/comm/__init__.py +11 -0
- mslk/conv/__init__.py +11 -0
- mslk/gemm/__init__.py +18 -0
- mslk/gemm/triton/__init__.py +7 -0
- mslk/gemm/triton/fp8_gemm.py +2702 -0
- mslk/gemm/triton/grouped_gemm.py +1132 -0
- mslk/gemm/triton/matmul_perf_model.py +237 -0
- mslk/gemm/triton/utils.py +128 -0
- mslk/kv_cache/__init__.py +11 -0
- mslk/moe/__init__.py +26 -0
- mslk/moe/activation.py +291 -0
- mslk/moe/gather_scatter.py +739 -0
- mslk/moe/layers.py +1240 -0
- mslk/moe/shuffling.py +421 -0
- mslk/mslk.so +0 -0
- mslk/quantize/__init__.py +11 -0
- mslk/quantize/shuffle.py +306 -0
- mslk/quantize/triton/__init__.py +7 -0
- mslk/quantize/triton/fp4_quantize.py +5942 -0
- mslk/quantize/triton/fp8_quantize.py +1902 -0
- mslk/testing/__init__.py +7 -0
- mslk/testing/attributes.py +60 -0
- mslk/testing/rocm.py +91 -0
- mslk/utils/__init__.py +7 -0
- mslk/utils/torch/__init__.py +7 -0
- mslk/utils/torch/library.py +150 -0
- mslk/utils/triton/__init__.py +7 -0
- mslk/utils/triton/fp8_utils.py +72 -0
- mslk/utils/triton/utils.py +128 -0
- mslk/version.py +11 -0
- mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
- mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
- mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
- mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
- mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/quantize/shuffle.py
ADDED
@@ -0,0 +1,306 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

# Helper functions for using MSLK quantized operators.

from typing import Tuple

import torch
from mslk.quantize.triton.fp8_quantize import quantize_fp8_row


def pack_int4(x: torch.Tensor) -> torch.Tensor:
    # Given int8 x, pack adjacent int4 values into a single int8.
    low_x = x[:, ::2]
    high_x = x[:, 1::2]

    # High bits need to left shift, this also masks off extra bits.
    high_x = torch.bitwise_left_shift(high_x, 4)
    # Low bits need to have sign bits removed.
    low_x = torch.bitwise_and(low_x, 0xF)

    # Recombine into a single value with bitwise or.
    return torch.bitwise_or(low_x, high_x).contiguous()
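
For context, a minimal sketch of what pack_int4 produces for a tiny input (editor's illustration, not part of shuffle.py; values are arbitrary):

# Editor's illustration only, not part of shuffle.py.
# Column 2*i lands in the low nibble of output byte i, column 2*i + 1 in the high nibble.
x = torch.tensor([[1, -2, 3, -4]], dtype=torch.int8)  # [1, 4] int4 values stored in int8
packed = pack_int4(x)  # [1, 2]; packed[0, 0] has low nibble 0x1 (1) and high nibble 0xE (-2)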


def int4_row_quantize_zp(
    x: torch.Tensor,
    group_size: int = 128,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    n_bit = 4  # Number of target bits.
    # Split input into chunks of group_size. This approach allows K that isn't divisible by group_size.
    to_quant = torch.split(x.to(torch.float), group_size, dim=-1)

    max_val = [chunk.amax(dim=1, keepdim=True) for chunk in to_quant]
    min_val = [chunk.amin(dim=1, keepdim=True) for chunk in to_quant]
    max_int = 2**n_bit - 1
    min_int = 0
    scales = [
        (max_chunk - min_chunk).clamp(min=1e-6) / max_int
        for max_chunk, min_chunk in zip(max_val, min_val)
    ]

    zeros = [
        min_chunk + scale_chunk * (2 ** (n_bit - 1))
        for min_chunk, scale_chunk in zip(min_val, scales)
    ]

    out = [
        chunk.sub(min_chunk).div(scale_chunk).round().clamp_(min_int, max_int)
        for chunk, min_chunk, scale_chunk in zip(to_quant, min_val, scales)
    ]

    # Recenter output and move to int8.
    out = [(chunk - 2 ** (n_bit - 1)).to(dtype=torch.int8) for chunk in out]

    # Recombine chunks.
    out = torch.cat(out, dim=-1)

    # Cutlass expects column major layout for scale and zero point,
    # so we transpose here and make them contiguous.
    scales = torch.cat(scales, dim=-1).t().contiguous()
    zeros = torch.cat(zeros, dim=-1).t().contiguous()

    return out, scales, zeros
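
A minimal sketch of the asymmetric (zero-point) path above (editor's illustration; the [4, 256] size is arbitrary):

# Editor's illustration only, not part of shuffle.py.
w = torch.randn(4, 256, dtype=torch.bfloat16)
wq, scales, zeros = int4_row_quantize_zp(w, group_size=128)
# wq: [4, 256] int8 values in [-8, 7]; scales, zeros: [2, 4], i.e. [K // group_size, N].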


def int4_row_quantize(
    x: torch.Tensor,
    group_size: int = 128,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Helper function to quantize a tensor to int4 with groupwise scales.

    Args:
        x (Tensor): [N, K] Higher precision weight tensor to quantize.
        group_size (int): Number of elements to calculate group scale for.
    Returns:
        wq (Tensor): [N, K] Quantized int4 values, one per int8 element (pack with pack_int4 for [N, K // 2]).
        group_scale (Tensor): [K / group_size, N] FP32 Scale per group.
    """
    n_bit = 4  # Number of target bits.
    # Split input into chunks of group_size. This approach allows K that isn't divisible by group_size.
    to_quant = torch.split(x.to(torch.float), group_size, dim=-1)

    max_val = [torch.abs(chunk).amax(dim=-1, keepdim=True) for chunk in to_quant]
    max_int = 2 ** (n_bit - 1)
    min_int = -(2 ** (n_bit - 1))
    scales = [chunk.clamp(min=1e-6) / max_int for chunk in max_val]

    out = [
        chunk.div(chunk_scale).round().clamp_(min_int, max_int - 1)
        for chunk, chunk_scale in zip(to_quant, scales)
    ]
    # Recombine chunks.
    out = torch.cat(out, dim=-1)

    # Cast to int8 and restore shape.
    out = out.to(dtype=torch.int8)

    # Scales should be in [num_groups, N] layout.
    scales = torch.cat(scales, dim=-1).t().contiguous()

    return out, scales
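
The symmetric path pairs with pack_int4 to produce the packed layout (editor's illustration):

# Editor's illustration only, not part of shuffle.py.
w = torch.randn(4, 256, dtype=torch.bfloat16)
wq_int8, group_scale = int4_row_quantize(w, group_size=128)  # [4, 256] int8, [2, 4] fp32 scales
wq_packed = pack_int4(wq_int8)  # [4, 128]: two int4 values per int8 element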


def quantize_int4_preshuffle(
    w: torch.Tensor, group_size: int = 128, dtype: str = "fp8", use_zp: bool = True
) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Quantizes an input weight tensor to int4 using preshuffling and scale packing.
    This function is intended to be used with MSLK's mixed dtype kernels and is expected
    to be applied to weights ahead of time. As such, it is not perfectly optimized.

    Args:
        w (Tensor): [N, K] Higher precision weight tensor to quantize. May optionally have a batch dimension.
        group_size (int): Number of elements to calculate group scale for, must be at least 128.
        dtype (str): Type of corresponding activations. Must be "fp8" or "bf16".
        use_zp (bool): If true, uses zero points during weight quantization. Only relevant for bf16 currently.
    Returns:
        wq (Tensor): [N, K // 2] Quantized int4 weight tensor packed into int8 elements.
        scales (Tuple[Tensor]): Scale tensors for the specified activation type. When FP8 is used,
        scales is a tuple of group_scale ([K / group_size, 8, N]) and row_scale ([N]). When BF16 is
        used, scales is a tuple of group_scale ([K / group_size, N]) and group_zero ([K / group_size, N]).
    """

    def _quantize(
        w: torch.Tensor, dtype: str = "fp8"
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if dtype == "fp8":
            # Start by lowering weights to FP8 and producing row scales.
            wq, row_scale = quantize_fp8_row(w)

            # Now reduce to INT4.
            wq, group_scale = int4_row_quantize(wq, group_size)
            # Reduce group scale to FP8.
            group_scale = group_scale.to(torch.float8_e4m3fn)
            # Take quantized weights and pack them efficiently.
            wq = pack_int4(wq)
            # Finally pack weights and scales into efficient preshuffled format.
            wq, group_scale = torch.ops.mslk.preshuffle_i4(wq, group_scale)
            return wq, (group_scale, row_scale)

        elif dtype == "bf16":
            if use_zp:
                wq, group_scale, group_zero = int4_row_quantize_zp(w, group_size)
            else:
                wq, group_scale = int4_row_quantize(w, group_size)
                group_zero = torch.zeros_like(group_scale)
            # Set scales to activation type.
            group_scale = group_scale.to(torch.bfloat16)
            group_zero = group_zero.to(torch.bfloat16)
            # Take quantized weights and pack them efficiently.
            wq = pack_int4(wq)
            # Finally pack weights and scales into efficient preshuffled format.
            wq, group_scale = torch.ops.mslk.preshuffle_i4(wq, group_scale)
            return wq, (group_scale, group_zero)
        else:
            raise NotImplementedError("Only fp8 and bf16 activations supported.")

    if w.ndim >= 3:
        orig_shape = w.shape
        # Flatten to 3 dimensions then iterate over batches.
        wq, scales = zip(*[_quantize(i, dtype=dtype) for i in w])
        wq = torch.stack(wq).view(*orig_shape[:-2], *wq[0].shape)
        # Decompose then stack scales back into a tuple.
        a_scales, b_scales = zip(*scales)
        scales = (
            torch.stack(a_scales).view(*orig_shape[:-2], *a_scales[0].shape),
            torch.stack(b_scales).view(*orig_shape[:-2], *b_scales[0].shape),
        )
    else:
        wq, scales = _quantize(w, dtype=dtype)

    return wq, scales
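
An end-to-end sketch of quantize_int4_preshuffle (editor's illustration; it assumes the compiled mslk extension is loaded so that torch.ops.mslk.preshuffle_i4 is available, and the 4096 x 4096 size is arbitrary):

# Editor's illustration only, not part of shuffle.py; requires the mslk CUDA extension.
import torch
from mslk.quantize.shuffle import quantize_int4_preshuffle

w = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")
wq, (group_scale, row_scale) = quantize_int4_preshuffle(w, group_size=128, dtype="fp8")
# wq: [4096, 2048] preshuffled packed int4 weights; group_scale: fp8 group scales; row_scale: [4096].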


def shuffle_slice(
    x: torch.Tensor, dim: int, start: int, length: int, dtype: str = "fp8"
) -> torch.Tensor:
    """
    Helper function to slice a preshuffled int4 tensor. This is needed since the shuffling
    reorders rows based on the size of the input. Slicing a tensor shuffled for a larger input
    is no longer valid. We must reorder the tensor to the appropriate size then slice.
    Args:
        x (Tensor): [N, K // 2] Preshuffled int4 tensor.
        dim (int): Dimension to slice.
        start (int): Start of slice.
        length (int): Number of elements to slice in the original [N, K] dimension.
        dtype (str): Type of corresponding activations. Must be fp8 or bf16.
    Returns:
        sliced (Tensor): [length, K // 2] Sliced tensor.
    """
    # Get the size of the input tensor.
    assert dim in [x.ndim - 2, x.ndim - 1], "Only slicing along N or K is supported."
    assert length % 16 == 0, "Slicing must be a multiple of 16."
    orig_shape = x.shape
    N = x.shape[-2]
    K = x.shape[-1]
    # Tile shape is based on the activation dtype.
    assert dtype in ("fp8", "bf16"), "Only fp8 and bf16 activations supported."
    # Handle slice along N.
    if dim == x.ndim - 2:
        tile_shape = 8 if dtype == "fp8" else 16
        block_size = N // length
        # View the shape in terms of shuffled tiles then permute to allow slicing.
        x_s = x.view(-1, tile_shape, block_size, length // tile_shape, K)
        x_s = x_s.permute(0, 2, 1, 3, 4).contiguous().view(-1, N, K)
        out_slice = x_s.narrow(1, start, length)
        # Reshape back to original shape.
        return out_slice.view(*orig_shape[:-2], length, K)
    # Handle slice along K.
    else:
        outer_dim = x.view(-1, N, K).shape[0]
        x_s = x.view(outer_dim, -1, length // 2)
        row_factor = x_s.shape[1] * (length // 2) // K
        # Take slices of rows corresponding to column slice.
        return x_s.narrow(1, start * 2 * K // length, row_factor).view(
            *orig_shape[:-2], N, length // 2
        )
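
A shape-level sketch of shuffle_slice along N (editor's illustration; a random int8 tensor stands in for a genuinely preshuffled weight, so only the shapes are meaningful):

# Editor's illustration only, not part of shuffle.py.
wq = torch.randint(-128, 128, (4096, 2048), dtype=torch.int8)  # [N, K // 2] preshuffled int4
top_half = shuffle_slice(wq, dim=0, start=0, length=2048, dtype="fp8")  # [2048, 2048]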


def scale_nvfp4_quant(
    input: torch.Tensor, input_global_scale: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize input tensor to FP4 and return quantized tensor and scale.
    This function quantizes the last dimension of the given tensor `input`. For
    every 16 consecutive elements, a single dynamically computed scaling factor
    is shared. This scaling factor is quantized using the `input_global_scale`
    and is stored in a swizzled layout (see
    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).
    Args:
        input: The input tensor to be quantized to FP4.
        input_global_scale: A scalar scaling factor for the entire tensor.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4, with every
        two values packed into a uint8, and the float8_e4m3 scaling factors
        in the swizzled layout.
    """
    assert input.ndim >= 1, f"input.ndim needs to be >= 1, but got {input.ndim}."
    other_dims = 1 if input.ndim == 1 else -1
    input = input.reshape(other_dims, input.shape[-1])
    m, n = input.shape
    block_size = 16
    device = input.device

    assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
    assert input.dtype in (
        torch.float16,
        torch.bfloat16,
    ), f"input.dtype needs to be fp16 or bf16 but got {input.dtype}."

    # Two fp4 values will be packed into an uint8.
    output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)

    # We use the rounded values to store the swizzled values. Due to the
    # requirement of the Tensor Core, the minimum tile is 128x4 for the scales.
    # So, we first pad the scales to multiples of 128 and 4. Then, the scales
    # (in float8_e4m3fn) are packed into an int32 for every 4 values. More:
    # https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x
    def round_up(x: int, y: int) -> int:
        return (x + y - 1) // y * y

    rounded_m = round_up(m, 128)
    scale_n = n // block_size
    rounded_n = round_up(scale_n, 4)
    output_scale = torch.empty(
        (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
    )

    torch.ops.mslk.scaled_fp4_quant(output, input, output_scale, input_global_scale)
    output_scale = output_scale.view(torch.float8_e4m3fn)
    return output, output_scale
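
A sketch of scale_nvfp4_quant (editor's illustration; it needs a CUDA device plus the mslk extension for torch.ops.mslk.scaled_fp4_quant, and the global-scale recipe shown is a common NVFP4 convention rather than something this file prescribes):

# Editor's illustration only, not part of shuffle.py; requires the mslk CUDA extension.
x = torch.randn(128, 256, dtype=torch.bfloat16, device="cuda")
# Assumed convention: global scale = FP8_E4M3_MAX (448) * FP4_E2M1_MAX (6) / amax(x).
global_scale = (448.0 * 6.0) / x.abs().amax().float()
xq, x_scale = scale_nvfp4_quant(x, global_scale)
# xq: [128, 128] uint8 with two fp4 values per byte; x_scale: swizzled float8_e4m3fn block scales.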


def ck_preshuffle(src: torch.Tensor, NXdl: int = 16) -> torch.Tensor:
    """
    Applies shuffling to make weights more efficient for use with CK kernels.
    Args:
        src (torch.Tensor): Input tensor with dtype float8_e4m3fnuz.
        NXdl (int): Wave tile size along N.
    Returns:
        torch.Tensor: The shuffled tensor.
    """
    # Check input datatype.
    if src.dtype != torch.float8_e4m3fnuz:
        raise TypeError("Input must be type float8_e4m3fnuz.")
    N, K = src.shape
    KPack = 16
    NLane = NXdl
    KLane = 64 // NLane
    K0 = K // (KLane * KPack)
    # Reshape src to enable the required permutation.
    # Original shape: (N, K)
    # Desired intermediate shape for permutation: (N0, NLane, K0, KLane, KPack)
    src = src.reshape(N // NLane, NLane, K0, KLane, KPack)
    # Apply permutation: (N0, NLane, K0, KLane, KPack) -> (N0, K0, KLane, NLane, KPack)
    dst = src.permute(0, 2, 3, 1, 4).contiguous()
    # Reshape to original input shape.
    dst = dst.reshape(N, K)
    return dst
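
Finally, a shape-level sketch of ck_preshuffle, which runs in plain PyTorch (editor's illustration; with the default NXdl=16, N must be divisible by 16 and K by 64):

# Editor's illustration only, not part of shuffle.py.
w = torch.randn(64, 256).to(torch.float8_e4m3fnuz)  # ROCm fp8 weight, [N, K]
w_shuffled = ck_preshuffle(w, NXdl=16)  # same [64, 256] shape, elements reordered into wave tiles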