mslk-cuda-nightly 2026.1.19-cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mslk/__init__.py +56 -0
- mslk/attention/__init__.py +7 -0
- mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
- mslk/attention/flash_attn/__init__.py +22 -0
- mslk/attention/flash_attn/ampere_helpers.py +104 -0
- mslk/attention/flash_attn/barrier.py +72 -0
- mslk/attention/flash_attn/benchmark.py +269 -0
- mslk/attention/flash_attn/blackwell_helpers.py +754 -0
- mslk/attention/flash_attn/block_info.py +109 -0
- mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
- mslk/attention/flash_attn/block_sparsity.py +219 -0
- mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
- mslk/attention/flash_attn/copy_utils.py +341 -0
- mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
- mslk/attention/flash_attn/fast_math.py +22 -0
- mslk/attention/flash_attn/flash_bwd.py +1262 -0
- mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
- mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
- mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
- mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
- mslk/attention/flash_attn/flash_fwd.py +2471 -0
- mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
- mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
- mslk/attention/flash_attn/hopper_helpers.py +102 -0
- mslk/attention/flash_attn/interface.py +1771 -0
- mslk/attention/flash_attn/mask.py +610 -0
- mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
- mslk/attention/flash_attn/named_barrier.py +32 -0
- mslk/attention/flash_attn/pack_gqa.py +165 -0
- mslk/attention/flash_attn/paged_kv.py +176 -0
- mslk/attention/flash_attn/pipeline.py +273 -0
- mslk/attention/flash_attn/seqlen_info.py +139 -0
- mslk/attention/flash_attn/softmax.py +583 -0
- mslk/attention/flash_attn/testing.py +424 -0
- mslk/attention/flash_attn/tile_scheduler.py +720 -0
- mslk/attention/flash_attn/utils.py +860 -0
- mslk/attention/fmha/__init__.py +967 -0
- mslk/attention/fmha/_triton/__init__.py +6 -0
- mslk/attention/fmha/_triton/available.py +50 -0
- mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
- mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
- mslk/attention/fmha/attn_bias.py +2186 -0
- mslk/attention/fmha/attn_bias_utils.py +536 -0
- mslk/attention/fmha/ck.py +508 -0
- mslk/attention/fmha/ck_decoder.py +141 -0
- mslk/attention/fmha/ck_splitk.py +204 -0
- mslk/attention/fmha/common.py +598 -0
- mslk/attention/fmha/cutlass.py +461 -0
- mslk/attention/fmha/cutlass_blackwell.py +560 -0
- mslk/attention/fmha/dispatch.py +224 -0
- mslk/attention/fmha/flash.py +862 -0
- mslk/attention/fmha/flash3.py +858 -0
- mslk/attention/fmha/flash_mtia.py +245 -0
- mslk/attention/fmha/merge_training.py +192 -0
- mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
- mslk/attention/fmha/torch_attention_compat.py +154 -0
- mslk/attention/fmha/tree_attention.py +718 -0
- mslk/attention/fmha/triton_splitk.py +1378 -0
- mslk/attention/fmha/unbind.py +130 -0
- mslk/attention/fmha/utils/__init__.py +6 -0
- mslk/attention/fmha/utils/bench.py +74 -0
- mslk/attention/fmha/utils/cpp_lib.py +148 -0
- mslk/attention/fmha/utils/op_common.py +65 -0
- mslk/attention/gqa_attn_splitk/__init__.py +11 -0
- mslk/bench/comm/__init__.py +7 -0
- mslk/bench/comm/comm_bench.py +255 -0
- mslk/bench/common/__init__.py +5 -0
- mslk/bench/common/utils.py +148 -0
- mslk/bench/conv/__init__.py +7 -0
- mslk/bench/conv/conv_bench.py +551 -0
- mslk/bench/conv/conv_ops.py +213 -0
- mslk/bench/gemm/__init__.py +7 -0
- mslk/bench/gemm/gemm_bench.py +859 -0
- mslk/bench/gemm/gemm_ops.py +3342 -0
- mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
- mslk/bench/moe/__init__.py +7 -0
- mslk/bench/moe/gather_scatter_bench.py +356 -0
- mslk/bench/quantize/quantize_bench.py +345 -0
- mslk/bench/quantize/quantize_ops.py +266 -0
- mslk/comm/__init__.py +11 -0
- mslk/conv/__init__.py +11 -0
- mslk/gemm/__init__.py +18 -0
- mslk/gemm/triton/__init__.py +7 -0
- mslk/gemm/triton/fp8_gemm.py +2702 -0
- mslk/gemm/triton/grouped_gemm.py +1132 -0
- mslk/gemm/triton/matmul_perf_model.py +237 -0
- mslk/gemm/triton/utils.py +128 -0
- mslk/kv_cache/__init__.py +11 -0
- mslk/moe/__init__.py +26 -0
- mslk/moe/activation.py +291 -0
- mslk/moe/gather_scatter.py +739 -0
- mslk/moe/layers.py +1240 -0
- mslk/moe/shuffling.py +421 -0
- mslk/mslk.so +0 -0
- mslk/quantize/__init__.py +11 -0
- mslk/quantize/shuffle.py +306 -0
- mslk/quantize/triton/__init__.py +7 -0
- mslk/quantize/triton/fp4_quantize.py +5942 -0
- mslk/quantize/triton/fp8_quantize.py +1902 -0
- mslk/testing/__init__.py +7 -0
- mslk/testing/attributes.py +60 -0
- mslk/testing/rocm.py +91 -0
- mslk/utils/__init__.py +7 -0
- mslk/utils/torch/__init__.py +7 -0
- mslk/utils/torch/library.py +150 -0
- mslk/utils/triton/__init__.py +7 -0
- mslk/utils/triton/fp8_utils.py +72 -0
- mslk/utils/triton/utils.py +128 -0
- mslk/version.py +11 -0
- mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
- mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
- mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
- mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
- mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/moe/shuffling.py
ADDED
@@ -0,0 +1,421 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

from typing import Optional, Union

import torch
import triton
import triton.language as tl

# Function APIs
def combine_shuffling(
    tokens: torch.Tensor,
    token_counts: torch.Tensor,
    expert_start: Optional[int] = None,
    expert_end: Optional[int] = None,
    is_padded: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    # pyre-ignore
    return _combine_or_split_shuffling(
        tokens=tokens,
        token_counts=token_counts,
        expert_start=expert_start,
        expert_end=expert_end,
        is_padded=is_padded,
        is_combine=True,
    )


def split_shuffling(
    tokens: torch.Tensor,
    token_counts: torch.Tensor,
    expert_start: Optional[int] = None,
    expert_end: Optional[int] = None,
    is_padded: bool = False,
    init_with_zeros: bool = False,
) -> torch.Tensor:
    # pyre-ignore
    return _combine_or_split_shuffling(
        tokens=tokens,
        token_counts=token_counts,
        expert_start=expert_start,
        expert_end=expert_end,
        is_padded=is_padded,
        is_combine=False,
        init_with_zeros=init_with_zeros,
    )

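The two wrappers above are inverse permutations over the same [T, D] buffer: combine_shuffling regroups tokens that arrive rank-major (all tokens from rank 0, then rank 1, ...) into expert-major order, and split_shuffling undoes that. A minimal usage sketch; the sizes and values here are illustrative, not taken from the package:

import torch
from mslk.moe.shuffling import combine_shuffling, split_shuffling

# Illustrative sizes: EP = 2 ranks, E = 2 experts, D = 8, T = 6 tokens.
tokens = torch.randn(6, 8, device="cuda", dtype=torch.bfloat16)
# token_counts[r, e]: number of tokens received from rank r for expert e.
token_counts = torch.tensor([[1, 2], [2, 1]], device="cuda", dtype=torch.int32)

# Rank-major -> expert-major; also returns per-expert totals with the
# grand total appended at index E.
combined, counts = combine_shuffling(tokens, token_counts)
assert counts.tolist() == [3, 3, 6]

# split_shuffling applies the inverse permutation.
restored = split_shuffling(combined, token_counts)
torch.testing.assert_close(restored, tokens)
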
def _combine_or_split_shuffling(
    tokens: torch.Tensor,
    token_counts: torch.Tensor,
    expert_start: Optional[int],
    expert_end: Optional[int],
    is_padded: bool,
    is_combine: bool,
    init_with_zeros: bool = False,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
    # T is intentionally ignored in kernel interface to avoid recompilation
    assert tokens.is_contiguous()
    assert token_counts.is_contiguous()

    T, D = tokens.shape
    EP, E = token_counts.shape
    B_T = -1
    if is_padded:
        assert T % EP == 0
        B_T = T // EP

    if expert_start is None:
        expert_start = 0
    if expert_end is None:
        expert_end = E

    EG: int = expert_end - expert_start

    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
    SPLIT_D = max(NUM_SMS // (EP * EG), 1)
    SPLIT_D = triton.next_power_of_2(SPLIT_D + 1)
    if T <= 1024:
        SPLIT_D //= 2

    if is_combine:
        grid = (EP * EG * SPLIT_D + 1,)
    else:
        grid = (EP * EG * SPLIT_D,)

    output_tokens = (
        torch.zeros_like(tokens) if init_with_zeros else torch.empty_like(tokens)
    )
    if is_combine:
        output_token_counts = torch.empty(
            EG + 1, dtype=token_counts.dtype, device=token_counts.device
        )
    else:
        output_token_counts = None

    BLOCK_E = max(triton.next_power_of_2(E), 8)
    BLOCK_EG = max(triton.next_power_of_2(EG), 8)
    BLOCK_EP = max(triton.next_power_of_2(EP), 8)

    _mslk_combine_or_split_shuffling[grid](
        tokens,
        token_counts,
        output_tokens,
        output_token_counts,
        is_combine,
        expert_start,
        is_padded,
        B_T,
        EG,
        EP,
        E,
        D,
        BLOCK_E,
        BLOCK_EG,
        BLOCK_EP,
        SPLIT_D,
    )

    if is_combine:
        assert output_token_counts is not None
        return output_tokens, output_token_counts
    else:
        return output_tokens

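To make the launch geometry concrete, here is the arithmetic above replayed for one hypothetical configuration (the SM count of 132 and all sizes are assumed for illustration):

import triton

# Hypothetical: EP = 2 ranks, EG = 4 local experts, 132 SMs, T = 4096 tokens.
NUM_SMS, EP, EG, T = 132, 2, 4, 4096
SPLIT_D = max(NUM_SMS // (EP * EG), 1)         # 16 D-slices per (rank, expert)
SPLIT_D = triton.next_power_of_2(SPLIT_D + 1)  # rounds up to 32
if T <= 1024:                                  # small inputs halve the slices
    SPLIT_D //= 2
grid = (EP * EG * SPLIT_D + 1,)  # 257 programs for combine; the extra
                                 # program reduces the token counts
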
# Torch Custom Op Registrations
_COMBINE_SHUFFLING_OP_NAME = "mslk::combine_shuffling"

torch.library.define(
    _COMBINE_SHUFFLING_OP_NAME,
    "(Tensor tokens, Tensor token_counts, int? expert_start = None, int? expert_end = None, bool? is_padded = False) -> (Tensor, Tensor)",
)


@torch.library.impl(_COMBINE_SHUFFLING_OP_NAME, "Meta")
def combine_shuffling_meta(
    tokens,
    token_counts,
    expert_start,
    expert_end,
    is_padded,
):
    _, E = token_counts.shape
    if expert_start is None:
        expert_start = 0
    if expert_end is None:
        expert_end = E

    EG: int = expert_end - expert_start
    output_tokens = torch.empty_like(tokens)
    output_token_counts = torch.empty(
        EG + 1, dtype=token_counts.dtype, device=token_counts.device
    )
    return output_tokens, output_token_counts


@torch.library.impl(_COMBINE_SHUFFLING_OP_NAME, "CUDA")
def combine_shuffling_cuda(
    tokens,
    token_counts,
    expert_start=None,
    expert_end=None,
    is_padded=False,
):
    return combine_shuffling(
        tokens,
        token_counts,
        expert_start,
        expert_end,
        is_padded,
    )


_SPLIT_SHUFFLING_OP_NAME = "mslk::split_shuffling"

torch.library.define(
    _SPLIT_SHUFFLING_OP_NAME,
    "(Tensor tokens, Tensor token_counts, int? expert_start = None, int? expert_end = None, bool? is_padded = False, bool? init_with_zeros = False) -> Tensor",
)


@torch.library.impl(_SPLIT_SHUFFLING_OP_NAME, "Meta")
def split_shuffling_meta(
    tokens,
    token_counts,
    expert_start,
    expert_end,
    is_padded,
    init_with_zeros,  # declared in the op schema, so the dispatcher passes it
):
    output_tokens = torch.empty_like(tokens)
    return output_tokens


@torch.library.impl(_SPLIT_SHUFFLING_OP_NAME, "CUDA")
def split_shuffling_cuda(
    tokens,
    token_counts,
    expert_start=None,
    expert_end=None,
    is_padded=False,
    init_with_zeros=False,  # declared in the op schema, so the dispatcher passes it
):
    return split_shuffling(
        tokens,
        token_counts,
        expert_start,
        expert_end,
        is_padded,
        init_with_zeros,
    )

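Because both ops are registered through torch.library with Meta kernels, shape propagation works without touching a GPU, e.g. under FakeTensor tracing or torch.compile. A small sketch, illustrative only:

import torch
import mslk.moe.shuffling  # noqa: F401 -- runs the torch.library registrations

tokens = torch.randn(6, 8, device="meta")
token_counts = torch.empty(2, 2, dtype=torch.int32, device="meta")

# Dispatches to combine_shuffling_meta: no kernel launch, shapes only.
out_tokens, out_counts = torch.ops.mslk.combine_shuffling(tokens, token_counts)
assert out_tokens.shape == (6, 8)
assert out_counts.shape == (3,)  # EG + 1, with EG == E == 2 here
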
# Kernel Implementations
_NV_CONFIGS = [
    triton.Config(
        {
            "BLOCK_T": block_t,
            "BLOCK_D": block_d,
        },
        num_stages=num_stages,
        num_warps=num_warps,
        num_ctas=num_ctas,
    )
    for block_t in [32, 64]
    for block_d in [256, 512, 1024]
    for num_stages in [1, 3]
    for num_warps in [8, 16]
    for num_ctas in [1]
]

_AMD_CONFIGS = [
    triton.Config(
        {
            "BLOCK_T": block_t,
            "BLOCK_D": block_d,
            "waves_per_eu": waves_per_cu,
        },
        num_stages=num_stages,
        num_warps=num_warps,
    )
    for block_t in [32, 64]
    for block_d in [256, 512, 1024]
    for num_stages in [1, 3]
    for num_warps, waves_per_cu in [(8, 2), (16, 4)]
]

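Note that the autotune key below covers only COMBINE and the shape constants (EG, EP, E, D), not the token count, which matches the "avoid recompilation" comment in the host wrapper. The search space itself is deliberately small; a quick sanity check of the comprehensions above:

# 2 (BLOCK_T) * 3 (BLOCK_D) * 2 (num_stages) * 2 (num_warps / waves) = 24
assert len(_NV_CONFIGS) == 24 and len(_AMD_CONFIGS) == 24
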
@triton.autotune(
    configs=_AMD_CONFIGS if torch.version.hip else _NV_CONFIGS,
    key=[
        "COMBINE",
        "EG",
        "EP",
        "E",
        "D",
    ],
)
@triton.jit
def _mslk_combine_or_split_shuffling(
    input_tokens_ptr,
    input_token_counts_ptr,
    output_tokens_ptr,
    output_token_counts_ptr,
    COMBINE: tl.constexpr,
    EG_START,
    PADDED,
    B_T: tl.constexpr,
    EG: tl.constexpr,
    EP: tl.constexpr,
    E: tl.constexpr,
    D: tl.constexpr,
    BLOCK_E: tl.constexpr,
    BLOCK_EG: tl.constexpr,
    BLOCK_EP: tl.constexpr,
    SPLIT_D: tl.constexpr,
    BLOCK_T: tl.constexpr,
    BLOCK_D: tl.constexpr,
) -> None:
    """
    tokens: [T, D]
    input_token_counts: [EP, E]
    output_tokens: [T, D]
    output_token_counts: [EG + 1]
    """
    tidx = tl.program_id(0)

    NUM_D_BLOCKS: tl.constexpr = (D + SPLIT_D * BLOCK_D - 1) // (SPLIT_D * BLOCK_D)

    rank = tidx // (EG * SPLIT_D)
    local_expert = (tidx % (EG * SPLIT_D)) // SPLIT_D
    didx = tidx % SPLIT_D
    # All experts in communication group
    offs_e = tl.arange(0, BLOCK_E)
    # Local experts
    offs_eg = tl.arange(0, BLOCK_EG)
    # Ranks
    offs_ep = tl.arange(0, BLOCK_EP)

    global_expert = local_expert + EG_START

    input_token_counts = tl.load(
        input_token_counts_ptr + offs_ep[:, None] * E + offs_e[None, :],
        eviction_policy="evict_last",
        mask=((offs_ep[:, None] < EP) & (offs_e[None, :] < E)),
        other=0,
    )  # [EP, E]

    if E == EG:
        input_token_counts_eg = input_token_counts
    else:
        input_token_counts_eg = tl.load(
            input_token_counts_ptr + offs_ep[:, None] * E + EG_START + offs_eg[None, :],
            eviction_policy="evict_last",
            mask=((offs_ep[:, None] < EP) & (offs_eg[None, :] < EG)),
            other=0,
        )  # [EP, EG]

    if COMBINE:
        LAST_TILE: tl.constexpr = EP * EG * SPLIT_D

        if tidx == LAST_TILE:
            output_token_counts_eg = tl.sum(input_token_counts_eg, axis=0)
            tl.store(
                output_token_counts_ptr + offs_eg,
                output_token_counts_eg,
                mask=(offs_eg < EG),
            )
            output_token_counts_eg = tl.sum(output_token_counts_eg)
            tl.store(output_token_counts_ptr + EG, output_token_counts_eg)
            return

    cond0 = offs_ep[:, None] < rank
    cond1 = offs_ep[:, None] == rank

    cond2 = offs_e[None, :] < global_expert

    if PADDED:
        tl.device_assert(B_T >= 0)
        # Only need information from previous experts in the same rank.
        ep_first_order = (
            tl.sum(tl.where(cond1 and cond2, input_token_counts, 0)) + B_T * rank
        )
    else:
        # r < rank || (r == rank && e < expert)
        ep_first_order = tl.sum(
            tl.where(cond0 or (cond1 and cond2), input_token_counts, 0)
        )

    cond4 = offs_eg[None, :] < local_expert
    cond5 = offs_eg[None, :] == local_expert

    # Expert first only need information from local experts across ranks.
    # e < expert || (e == expert && r < rank)
    expert_first_order = tl.sum(
        tl.where(cond4 or (cond5 and cond0), input_token_counts_eg, 0)
    )

    if COMBINE:
        input_offset = ep_first_order
        output_offset = expert_first_order
    else:
        input_offset = expert_first_order
        output_offset = ep_first_order

    input_offset = input_offset.to(tl.int64)
    output_offset = output_offset.to(tl.int64)

    num_copy_tokens = tl.load(input_token_counts_ptr + rank * E + global_expert)
    if num_copy_tokens == 0:
        return

    STEP_D: tl.constexpr = SPLIT_D * BLOCK_D
    MASK_D: tl.constexpr = D % STEP_D != 0

    num_t_blocks = tl.cdiv(num_copy_tokens, BLOCK_T)

    t_1d_ptr = tl.arange(0, BLOCK_T)[:, None]
    ti_1d_ptr = input_offset + t_1d_ptr
    to_1d_ptr = output_offset + t_1d_ptr

    d_1d_ptr = didx * NUM_D_BLOCKS * BLOCK_D + tl.arange(0, BLOCK_D)[None, :]

    i_2d_ptr = input_tokens_ptr + ti_1d_ptr * D + d_1d_ptr
    o_2d_ptr = output_tokens_ptr + to_1d_ptr * D + d_1d_ptr

    for i in range(num_t_blocks * NUM_D_BLOCKS):
        mask = t_1d_ptr < num_copy_tokens
        if MASK_D:
            mask &= d_1d_ptr < D

        block = tl.load(
            i_2d_ptr,
            mask=mask,
        )
        tl.store(
            o_2d_ptr,
            value=block,
            mask=mask,
        )

        if i % NUM_D_BLOCKS == (NUM_D_BLOCKS - 1):  # pyre-ignore
            # just to make sure constant folding happens
            D_1D_SHIFT: tl.constexpr = -(NUM_D_BLOCKS - 1) * BLOCK_D
            TD_2D_SHIFT: tl.constexpr = BLOCK_T * D + D_1D_SHIFT
            # increment T, D
            t_1d_ptr += BLOCK_T
            i_2d_ptr += TD_2D_SHIFT
            o_2d_ptr += TD_2D_SHIFT
            if MASK_D:
                d_1d_ptr += D_1D_SHIFT
        else:
            # increment D
            i_2d_ptr += BLOCK_D
            o_2d_ptr += BLOCK_D
            if MASK_D:
                d_1d_ptr += BLOCK_D
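For reference, the two orderings the kernel converts between can be written as plain prefix sums over token_counts. An illustrative reimplementation of the unpadded case with expert_start = 0, not part of the package:

import torch

def ep_first_offset(token_counts: torch.Tensor, rank: int, expert: int) -> int:
    # Rank-major layout: all tokens of earlier ranks, then the earlier
    # experts within the same rank.
    return int(token_counts[:rank].sum() + token_counts[rank, :expert].sum())

def expert_first_offset(token_counts: torch.Tensor, rank: int, expert: int) -> int:
    # Expert-major layout: all tokens of earlier experts (across all ranks),
    # then the same expert on earlier ranks.
    return int(token_counts[:, :expert].sum() + token_counts[:rank, expert].sum())
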
mslk/mslk.so
ADDED
Binary file

mslk/quantize/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from mslk.utils.torch.library import load_library_buck

load_library_buck("//mslk/csrc/quantize:quantize_ops")