PyPI - mslk-cuda-nightly - Versions diffs - 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl - Mend

mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

mslk/__init__.py +56 -0
mslk/attention/__init__.py +7 -0
mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
mslk/attention/flash_attn/__init__.py +22 -0
mslk/attention/flash_attn/ampere_helpers.py +104 -0
mslk/attention/flash_attn/barrier.py +72 -0
mslk/attention/flash_attn/benchmark.py +269 -0
mslk/attention/flash_attn/blackwell_helpers.py +754 -0
mslk/attention/flash_attn/block_info.py +109 -0
mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
mslk/attention/flash_attn/block_sparsity.py +219 -0
mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
mslk/attention/flash_attn/copy_utils.py +341 -0
mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
mslk/attention/flash_attn/fast_math.py +22 -0
mslk/attention/flash_attn/flash_bwd.py +1262 -0
mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
mslk/attention/flash_attn/flash_fwd.py +2471 -0
mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
mslk/attention/flash_attn/hopper_helpers.py +102 -0
mslk/attention/flash_attn/interface.py +1771 -0
mslk/attention/flash_attn/mask.py +610 -0
mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
mslk/attention/flash_attn/named_barrier.py +32 -0
mslk/attention/flash_attn/pack_gqa.py +165 -0
mslk/attention/flash_attn/paged_kv.py +176 -0
mslk/attention/flash_attn/pipeline.py +273 -0
mslk/attention/flash_attn/seqlen_info.py +139 -0
mslk/attention/flash_attn/softmax.py +583 -0
mslk/attention/flash_attn/testing.py +424 -0
mslk/attention/flash_attn/tile_scheduler.py +720 -0
mslk/attention/flash_attn/utils.py +860 -0
mslk/attention/fmha/__init__.py +967 -0
mslk/attention/fmha/_triton/__init__.py +6 -0
mslk/attention/fmha/_triton/available.py +50 -0
mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
mslk/attention/fmha/attn_bias.py +2186 -0
mslk/attention/fmha/attn_bias_utils.py +536 -0
mslk/attention/fmha/ck.py +508 -0
mslk/attention/fmha/ck_decoder.py +141 -0
mslk/attention/fmha/ck_splitk.py +204 -0
mslk/attention/fmha/common.py +598 -0
mslk/attention/fmha/cutlass.py +461 -0
mslk/attention/fmha/cutlass_blackwell.py +560 -0
mslk/attention/fmha/dispatch.py +224 -0
mslk/attention/fmha/flash.py +862 -0
mslk/attention/fmha/flash3.py +858 -0
mslk/attention/fmha/flash_mtia.py +245 -0
mslk/attention/fmha/merge_training.py +192 -0
mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
mslk/attention/fmha/torch_attention_compat.py +154 -0
mslk/attention/fmha/tree_attention.py +718 -0
mslk/attention/fmha/triton_splitk.py +1378 -0
mslk/attention/fmha/unbind.py +130 -0
mslk/attention/fmha/utils/__init__.py +6 -0
mslk/attention/fmha/utils/bench.py +74 -0
mslk/attention/fmha/utils/cpp_lib.py +148 -0
mslk/attention/fmha/utils/op_common.py +65 -0
mslk/attention/gqa_attn_splitk/__init__.py +11 -0
mslk/bench/comm/__init__.py +7 -0
mslk/bench/comm/comm_bench.py +255 -0
mslk/bench/common/__init__.py +5 -0
mslk/bench/common/utils.py +148 -0
mslk/bench/conv/__init__.py +7 -0
mslk/bench/conv/conv_bench.py +551 -0
mslk/bench/conv/conv_ops.py +213 -0
mslk/bench/gemm/__init__.py +7 -0
mslk/bench/gemm/gemm_bench.py +859 -0
mslk/bench/gemm/gemm_ops.py +3342 -0
mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
mslk/bench/moe/__init__.py +7 -0
mslk/bench/moe/gather_scatter_bench.py +356 -0
mslk/bench/quantize/quantize_bench.py +345 -0
mslk/bench/quantize/quantize_ops.py +266 -0
mslk/comm/__init__.py +11 -0
mslk/conv/__init__.py +11 -0
mslk/gemm/__init__.py +18 -0
mslk/gemm/triton/__init__.py +7 -0
mslk/gemm/triton/fp8_gemm.py +2702 -0
mslk/gemm/triton/grouped_gemm.py +1132 -0
mslk/gemm/triton/matmul_perf_model.py +237 -0
mslk/gemm/triton/utils.py +128 -0
mslk/kv_cache/__init__.py +11 -0
mslk/moe/__init__.py +26 -0
mslk/moe/activation.py +291 -0
mslk/moe/gather_scatter.py +739 -0
mslk/moe/layers.py +1240 -0
mslk/moe/shuffling.py +421 -0
mslk/mslk.so +0 -0
mslk/quantize/__init__.py +11 -0
mslk/quantize/shuffle.py +306 -0
mslk/quantize/triton/__init__.py +7 -0
mslk/quantize/triton/fp4_quantize.py +5942 -0
mslk/quantize/triton/fp8_quantize.py +1902 -0
mslk/testing/__init__.py +7 -0
mslk/testing/attributes.py +60 -0
mslk/testing/rocm.py +91 -0
mslk/utils/__init__.py +7 -0
mslk/utils/torch/__init__.py +7 -0
mslk/utils/torch/library.py +150 -0
mslk/utils/triton/__init__.py +7 -0
mslk/utils/triton/fp8_utils.py +72 -0
mslk/utils/triton/utils.py +128 -0
mslk/version.py +11 -0
mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0

mslk/attention/flash_attn/pipeline.py ADDED Viewed

@@ -0,0 +1,273 @@
+# @nolint # fbcode
+# Copyright (c) 2025, Tri Dao.
+# import math
+from typing import Optional
+from dataclasses import dataclass
+import cutlass
+import cutlass.cute as cute
+from cutlass import Boolean, Int32, const_expr
+from cutlass.cutlass_dsl import if_generate
+from cutlass.pipeline import PipelineAsync, PipelineState, Agent, CooperativeGroup
+from cutlass.pipeline import PipelineUserType, PipelineOp
+from cutlass.pipeline import PipelineTmaAsync as PipelineTmaAsyncOg
+from cutlass.pipeline import PipelineTmaUmma as PipelineTmaUmmaOg
+# We deviate from cute-dsl implementation to use cute.arch.cluster_arrive_relaxed
+def pipeline_init_wait(cta_layout_vmnk: Optional[cute.Layout] = None):
+    """
+    Fences the mbarrier init and syncs the threadblock or cluster
+    """
+    cute.arch.mbarrier_init_fence()
+    if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
+        # If not using clusters, sync the threadblock
+        _sync(Agent.ThreadBlock)
+    else:
+        # If using clusters, sync the cluster
+        _sync(Agent.ThreadBlockCluster)
+def _sync(group: Agent):
+    """
+    Syncs all threads within an agent.
+    """
+    if group is Agent.Thread:
+        raise NotImplementedError("Error: Not supported.")
+    elif group is Agent.ThreadBlock:
+        cute.arch.sync_threads()
+    elif group is Agent.ThreadBlockCluster:
+        cute.arch.cluster_arrive_relaxed()
+        cute.arch.cluster_wait()
+    else:
+        assert False, (
+            "Error: No explicit sync instruction exists. Please use barriers (named / mbarrier) instead."
+        )
+class PipelineStateSimple:
+    """
+    Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer.
+    Use a single Int32 to store both the index and phase bit, then we use divmod to get the
+    index and phase. If stages is a power of 2, divmod turns into bit twiddling.
+    """
+    def __init__(self, stages: int, phase_index: Int32):
+        # assert stages < 2**16
+        # self._log_stages = int(math.log2(stages))
+        # assert 1 << self._log_stages == stages, "Number of stages must be a power of 2."
+        self._stages = stages
+        self._phase_index = phase_index
+    def clone(self) -> "PipelineStateSimple":
+        return PipelineStateSimple(self.stages, self._phase_index)
+    @property
+    def stages(self) -> int:
+        # return 1 << self._log_stages
+        return self._stages
+    @property
+    def index(self) -> Int32:
+        # return self._phase_index & 0xFFFF
+        # return self._phase_index & ((1 << self._log_stages) - 1)
+        if const_expr(self._stages == 1):
+            return Int32(0)
+        else:
+            return self._phase_index % self._stages
+    @property
+    def phase(self) -> Int32:
+        # return self._phase_index >> 16
+        # PTX docs say that the phase parity needs to be 0 or 1, so by right we need to
+        # take modulo 2. But in practice just passing the phase in without modulo works fine.
+        # return (self._phase_index >> self._log_stages) % 2
+        # return self._phase_index >> self._log_stages
+        if const_expr(self._stages == 1):
+            return self._phase_index
+        else:
+            return self._phase_index // self._stages
+    def advance(self):
+        if const_expr(self._stages == 1):
+            self._phase_index ^= 1
+        else:
+            self._phase_index += 1
+        # def then_body(phase_index):
+        #     # XOR the phase bit and set the index to 0
+        #     return (phase_index & 0xFFFF0000) ^ (1 << 16)
+        # def else_body(phase_index):
+        #     return phase_index
+        # self._phase_index = if_generate(
+        #     (self._phase_index & 0xFFFF) == self.stages,
+        #     then_body,
+        #     else_body,
+        #     [self._phase_index],
+        #     [Int32],
+        # )
+    def __extract_mlir_values__(self):
+        phase_index = self._phase_index
+        return [phase_index.ir_value()]
+    def __new_from_mlir_values__(self, values):
+        return PipelineStateSimple(self.stages, Int32(values[0]))
+def make_pipeline_state(type: PipelineUserType, stages: int):
+    """
+    Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
+    """
+    if type is PipelineUserType.Producer:
+        # return PipelineStateSimple(stages, Int32(1 << 16))
+        return PipelineStateSimple(stages, Int32(stages))
+    elif type is PipelineUserType.Consumer:
+        return PipelineStateSimple(stages, Int32(0))
+    else:
+        assert False, "Error: invalid PipelineUserType specified for make_pipeline_state."
+@dataclass(frozen=True)
+class PipelineTmaAsync(PipelineTmaAsyncOg):
+    """
+    Override producer_acquire to take in extra_tx_count parameter.
+    """
+    @staticmethod
+    def create(*args, **kwargs):
+        obj = PipelineTmaAsyncOg.create(*args, **kwargs)
+        # Can't assign to __class__ directly since the dataclass is frozen
+        # obj.__class__ = PipelineTmaAsync
+        object.__setattr__(obj, "__class__", PipelineTmaAsync)
+        return obj
+    def producer_acquire(
+        self,
+        state: PipelineState,
+        try_acquire_token: Optional[Boolean] = None,
+        extra_tx_count: int = 0,
+    ):
+        """
+        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
+        """
+        if_generate(
+            try_acquire_token is None or try_acquire_token == 0,
+            lambda: self.sync_object_empty.wait(state.index, state.phase),
+        )
+        if const_expr(extra_tx_count == 0):
+            self.sync_object_full.arrive(state.index, self.producer_mask)
+        else:
+            tx_count = self.sync_object_full.tx_count + extra_tx_count
+            self.sync_object_full.arrive_and_expect_tx(state.index, tx_count)
+@dataclass(frozen=True)
+class PipelineTmaUmma(PipelineTmaUmmaOg):
+    """
+    PipelineTmaUmma with a local `create` factory (optional init wait) and a
+    `producer_acquire` that takes an extra_tx_count parameter.
+    """
+    @staticmethod
+    def create(
+        *,
+        num_stages: int,
+        producer_group: CooperativeGroup,
+        consumer_group: CooperativeGroup,
+        tx_count: int,
+        barrier_storage: cute.Pointer = None,
+        cta_layout_vmnk: Optional[cute.Layout] = None,
+        mcast_mode_mn: tuple[int, int] = (1, 1),
+        init_wait: cutlass.Constexpr[bool] = True,
+    ):
+        """
+        This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
+        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers.
+            Effectively required: the None default is rejected with a ValueError.
+        :type barrier_storage: cute.Pointer
+        :param num_stages: Number of buffer stages for this pipeline
+        :type num_stages: int
+        :param producer_group: `CooperativeGroup` for the producer agent
+        :type producer_group: CooperativeGroup
+        :param consumer_group: `CooperativeGroup` for the consumer agent
+        :type consumer_group: CooperativeGroup
+        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
+        :type tx_count: int
+        :param cta_layout_vmnk: Layout of the cluster shape
+        :type cta_layout_vmnk: cute.Layout | None
+        :param mcast_mode_mn: Tuple of two integers, specifying whether mcast is enabled for the m and n modes. At least one of the two integers must be 1.
+        :type mcast_mode_mn: tuple[int, int]
+        :param init_wait: When true, pipeline_init_wait(cta_layout_vmnk) is called before the pipeline is returned
+        :type init_wait: bool
+        :raises ValueError: If barrier_storage is not a cute.Pointer
+        """
+        if not isinstance(barrier_storage, cute.Pointer):
+            raise ValueError(
+                f"Expected barrier_storage to be a cute.Pointer, but got {type(barrier_storage)}"
+            )
+        producer_type = PipelineOp.TmaLoad
+        consumer_type = PipelineOp.TCGen05Mma
+        producer = (producer_type, producer_group)
+        consumer = (consumer_type, consumer_group)
+        # Full barriers start at the aligned base pointer and carry the tx_count.
+        sync_object_full = PipelineAsync._make_sync_object(
+            barrier_storage.align(min_align=8), num_stages, producer, tx_count
+        )
+        # Empty barriers start num_stages past the aligned base pointer.
+        sync_object_empty = PipelineAsync._make_sync_object(
+            barrier_storage.align(min_align=8) + num_stages, num_stages, consumer
+        )
+        if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1:
+            # No mcast mask if not using clusters
+            producer_mask = None
+            # All threadblocks are leaders if not using clusters
+            is_leader_cta = True
+        else:
+            producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(
+                cta_layout_vmnk, mcast_mode_mn
+            )
+            is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk)
+        # CtaGroup.ONE when there is no layout or mode 0 of the layout has size 1,
+        # CtaGroup.TWO otherwise.
+        cta_group = (
+            cute.nvgpu.tcgen05.CtaGroup.ONE
+            if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1
+            else cute.nvgpu.tcgen05.CtaGroup.TWO
+        )
+        # The consumer side reuses the producer's mask.
+        consumer_mask = producer_mask
+        if const_expr(init_wait):
+            pipeline_init_wait(cta_layout_vmnk)
+        return PipelineTmaUmma(
+            sync_object_full,
+            sync_object_empty,
+            num_stages,
+            producer_mask,
+            consumer_mask,
+            is_leader_cta,
+            cta_group,
+        )
+    def producer_acquire(
+        self,
+        state: PipelineState,
+        try_acquire_token: Optional[Boolean] = None,
+        extra_tx_count: int = 0,
+    ):
+        """
+        TMA producer acquire conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
+        :param state: Pipeline state supplying the stage index and phase
+        :param try_acquire_token: The empty-barrier wait is generated only when this is
+            None or compares equal to 0
+        :param extra_tx_count: When non-zero, added to the full barrier's tx_count and
+            passed to arrive_and_expect_tx; when zero, a plain arrive is issued
+        """
+        if_generate(
+            try_acquire_token is None or try_acquire_token == 0,
+            lambda: self.sync_object_empty.wait(state.index, state.phase),
+        )
+        # In both branches the arrive on the full barrier is gated on is_leader_cta.
+        if const_expr(extra_tx_count == 0):
+            if_generate(
+                self.is_leader_cta,
+                lambda: self.sync_object_full.arrive(state.index, self.producer_mask),
+            )
+        else:
+            tx_count = self.sync_object_full.tx_count + extra_tx_count
+            if_generate(
+                self.is_leader_cta,
+                lambda: self.sync_object_full.arrive_and_expect_tx(state.index, tx_count),
+            )

mslk/attention/flash_attn/seqlen_info.py ADDED Viewed

@@ -0,0 +1,139 @@
+# @nolint # fbcode
+from typing import Optional
+from dataclasses import dataclass
+import cutlass
+import cutlass.cute as cute
+from cutlass import Int32, const_expr
+"""
+This consolidates all the info related to sequence length. This is so that we can do all
+the gmem reads once at the beginning of each tile, rather than having to repeat these reads
+to compute various things like n_block_min, n_block_max, etc.
+"""
+@dataclass(frozen=True)
+class SeqlenInfo:
+    offset: cutlass.Int32
+    seqlen: cutlass.Int32
+    @staticmethod
+    def create(
+        batch_idx: cutlass.Int32,
+        seqlen_static: cutlass.Int32,
+        cu_seqlens: Optional[cute.Tensor] = None,
+        seqused: Optional[cute.Tensor] = None,
+    ):
+        offset = 0 if const_expr(cu_seqlens is None) else cu_seqlens[batch_idx]
+        if const_expr(seqused is not None):
+            seqlen = seqused[batch_idx]
+        elif const_expr(cu_seqlens is not None):
+            seqlen = cu_seqlens[batch_idx + 1] - cu_seqlens[batch_idx]
+        else:
+            seqlen = seqlen_static
+        return SeqlenInfo(offset, seqlen)
+@dataclass(frozen=True)
+class SeqlenInfoQK:
+    """
+    Per-batch sequence offsets and lengths for Q and K, plus compile-time flags
+    recording which optional tensors were supplied to `create`.
+    """
+    # Start offsets read from the cu_seqlens tensors (0 when not supplied).
+    offset_q: cutlass.Int32
+    offset_k: cutlass.Int32
+    # (offset + batch_idx * tile) rounded down to a multiple of the tile size
+    # (0 when the matching cu_seqlens tensor is not supplied).
+    padded_offset_q: cutlass.Int32
+    padded_offset_k: cutlass.Int32
+    # Sequence lengths for this batch element.
+    seqlen_q: cutlass.Int32
+    seqlen_k: cutlass.Int32
+    # Whether the corresponding optional tensor was passed to `create`.
+    has_cu_seqlens_q: cutlass.Constexpr[bool]
+    has_cu_seqlens_k: cutlass.Constexpr[bool]
+    has_seqused_q: cutlass.Constexpr[bool]
+    has_seqused_k: cutlass.Constexpr[bool]
+    @staticmethod
+    def create(
+        batch_idx: cutlass.Int32,
+        seqlen_q_static: cutlass.Int32,
+        seqlen_k_static: cutlass.Int32,
+        mCuSeqlensQ: Optional[cute.Tensor] = None,
+        mCuSeqlensK: Optional[cute.Tensor] = None,
+        mSeqUsedQ: Optional[cute.Tensor] = None,
+        mSeqUsedK: Optional[cute.Tensor] = None,
+        tile_m: cutlass.Constexpr[cutlass.Int32] = 128,
+        tile_n: cutlass.Constexpr[cutlass.Int32] = 128,
+    ):
+        """
+        Build the SeqlenInfoQK for ``batch_idx``.
+        For each of Q and K: the offset is ``cu_seqlens[batch_idx]`` (0 without
+        cu_seqlens); the length is ``seqused[batch_idx]`` when seqused is given,
+        otherwise ``cu_seqlens[batch_idx + 1] - offset`` when cu_seqlens is given,
+        otherwise the static length. ``tile_m`` / ``tile_n`` are the tile sizes used
+        for the padded Q / K offsets.
+        """
+        offset_q = 0 if const_expr(mCuSeqlensQ is None) else mCuSeqlensQ[batch_idx]
+        offset_k = 0 if const_expr(mCuSeqlensK is None) else mCuSeqlensK[batch_idx]
+        # Shift the offset by batch_idx tiles, then round down to a tile multiple.
+        padded_offset_q = (
+            0
+            if const_expr(mCuSeqlensQ is None)
+            else (offset_q + batch_idx * tile_m) // tile_m * tile_m
+        )
+        padded_offset_k = (
+            0
+            if const_expr(mCuSeqlensK is None)
+            else (offset_k + batch_idx * tile_n) // tile_n * tile_n
+        )
+        # seqused takes precedence over cu_seqlens, which takes precedence over the static length.
+        if const_expr(mSeqUsedQ is not None):
+            seqlen_q = mSeqUsedQ[batch_idx]
+        else:
+            seqlen_q = (
+                seqlen_q_static
+                if const_expr(mCuSeqlensQ is None)
+                else mCuSeqlensQ[batch_idx + 1] - offset_q
+            )
+        if const_expr(mSeqUsedK is not None):
+            seqlen_k = mSeqUsedK[batch_idx]
+        else:
+            seqlen_k = (
+                seqlen_k_static
+                if const_expr(mCuSeqlensK is None)
+                else mCuSeqlensK[batch_idx + 1] - offset_k
+            )
+        # Plain Python booleans recording which optional tensors are present.
+        has_cu_seqlens_q: bool = mCuSeqlensQ is not None
+        has_cu_seqlens_k: bool = mCuSeqlensK is not None
+        has_seqused_q: bool = mSeqUsedQ is not None
+        has_seqused_k: bool = mSeqUsedK is not None
+        return SeqlenInfoQK(
+            offset_q,
+            offset_k,
+            padded_offset_q,
+            padded_offset_k,
+            seqlen_q,
+            seqlen_k,
+            has_cu_seqlens_q,
+            has_cu_seqlens_k,
+            has_seqused_q,
+            has_seqused_k,
+        )
+    def offset_batch_Q(
+        self,
+        mQ: cute.Tensor,
+        batch_idx: Int32,
+        dim: int,
+        padded: cutlass.Constexpr[bool] = False,
+    ) -> cute.Tensor:
+        """
+        Return the view of mQ for this batch element.
+        Seqlen must be the first dimension of mQ.
+        Without cu_seqlens_q, mQ is indexed with batch_idx at mode ``dim`` and None in
+        every other mode. With cu_seqlens_q, mQ is shifted along mode 0 by offset_q
+        (padded_offset_q when ``padded`` is true) via cute.domain_offset.
+        """
+        if const_expr(not self.has_cu_seqlens_q):
+            idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mQ) - 1 - dim)
+            return mQ[idx]
+        else:
+            offset_q = self.offset_q if const_expr(not padded) else self.padded_offset_q
+            # When mode 0 of mQ is itself nested (rank != 1), the offset is applied to its
+            # second sub-mode. NOTE(review): presumably the packed-GQA layout - confirm.
+            offset = offset_q if const_expr(cute.rank(mQ.shape[0]) == 1) else (0, offset_q)
+            idx = (offset,) + (0,) * (cute.rank(mQ) - 1)
+            return cute.domain_offset(idx, mQ)
+    def offset_batch_K(
+        self,
+        mK: cute.Tensor,
+        batch_idx: Int32,
+        dim: int,
+        padded: cutlass.Constexpr[bool] = False,
+    ) -> cute.Tensor:
+        """
+        Return the view of mK for this batch element.
+        Seqlen must be the first dimension of mK.
+        Without cu_seqlens_k, mK is indexed with batch_idx at mode ``dim`` and None in
+        every other mode. With cu_seqlens_k, mK is shifted along mode 0 by offset_k
+        (padded_offset_k when ``padded`` is true) via cute.domain_offset. Unlike
+        offset_batch_Q, a nested mode 0 is not special-cased here.
+        """
+        if const_expr(not self.has_cu_seqlens_k):
+            idx = (None,) * dim + (batch_idx,) + (None,) * (cute.rank(mK) - 1 - dim)
+            return mK[idx]
+        else:
+            offset_k = self.offset_k if const_expr(not padded) else self.padded_offset_k
+            idx = (offset_k,) + (0,) * (cute.rank(mK) - 1)
+            return cute.domain_offset(idx, mK)