PyPI - mslk-cuda-nightly - Versions diffs - 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl - Mend

mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

mslk/__init__.py +56 -0
mslk/attention/__init__.py +7 -0
mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
mslk/attention/flash_attn/__init__.py +22 -0
mslk/attention/flash_attn/ampere_helpers.py +104 -0
mslk/attention/flash_attn/barrier.py +72 -0
mslk/attention/flash_attn/benchmark.py +269 -0
mslk/attention/flash_attn/blackwell_helpers.py +754 -0
mslk/attention/flash_attn/block_info.py +109 -0
mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
mslk/attention/flash_attn/block_sparsity.py +219 -0
mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
mslk/attention/flash_attn/copy_utils.py +341 -0
mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
mslk/attention/flash_attn/fast_math.py +22 -0
mslk/attention/flash_attn/flash_bwd.py +1262 -0
mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
mslk/attention/flash_attn/flash_fwd.py +2471 -0
mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
mslk/attention/flash_attn/hopper_helpers.py +102 -0
mslk/attention/flash_attn/interface.py +1771 -0
mslk/attention/flash_attn/mask.py +610 -0
mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
mslk/attention/flash_attn/named_barrier.py +32 -0
mslk/attention/flash_attn/pack_gqa.py +165 -0
mslk/attention/flash_attn/paged_kv.py +176 -0
mslk/attention/flash_attn/pipeline.py +273 -0
mslk/attention/flash_attn/seqlen_info.py +139 -0
mslk/attention/flash_attn/softmax.py +583 -0
mslk/attention/flash_attn/testing.py +424 -0
mslk/attention/flash_attn/tile_scheduler.py +720 -0
mslk/attention/flash_attn/utils.py +860 -0
mslk/attention/fmha/__init__.py +967 -0
mslk/attention/fmha/_triton/__init__.py +6 -0
mslk/attention/fmha/_triton/available.py +50 -0
mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
mslk/attention/fmha/attn_bias.py +2186 -0
mslk/attention/fmha/attn_bias_utils.py +536 -0
mslk/attention/fmha/ck.py +508 -0
mslk/attention/fmha/ck_decoder.py +141 -0
mslk/attention/fmha/ck_splitk.py +204 -0
mslk/attention/fmha/common.py +598 -0
mslk/attention/fmha/cutlass.py +461 -0
mslk/attention/fmha/cutlass_blackwell.py +560 -0
mslk/attention/fmha/dispatch.py +224 -0
mslk/attention/fmha/flash.py +862 -0
mslk/attention/fmha/flash3.py +858 -0
mslk/attention/fmha/flash_mtia.py +245 -0
mslk/attention/fmha/merge_training.py +192 -0
mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
mslk/attention/fmha/torch_attention_compat.py +154 -0
mslk/attention/fmha/tree_attention.py +718 -0
mslk/attention/fmha/triton_splitk.py +1378 -0
mslk/attention/fmha/unbind.py +130 -0
mslk/attention/fmha/utils/__init__.py +6 -0
mslk/attention/fmha/utils/bench.py +74 -0
mslk/attention/fmha/utils/cpp_lib.py +148 -0
mslk/attention/fmha/utils/op_common.py +65 -0
mslk/attention/gqa_attn_splitk/__init__.py +11 -0
mslk/bench/comm/__init__.py +7 -0
mslk/bench/comm/comm_bench.py +255 -0
mslk/bench/common/__init__.py +5 -0
mslk/bench/common/utils.py +148 -0
mslk/bench/conv/__init__.py +7 -0
mslk/bench/conv/conv_bench.py +551 -0
mslk/bench/conv/conv_ops.py +213 -0
mslk/bench/gemm/__init__.py +7 -0
mslk/bench/gemm/gemm_bench.py +859 -0
mslk/bench/gemm/gemm_ops.py +3342 -0
mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
mslk/bench/moe/__init__.py +7 -0
mslk/bench/moe/gather_scatter_bench.py +356 -0
mslk/bench/quantize/quantize_bench.py +345 -0
mslk/bench/quantize/quantize_ops.py +266 -0
mslk/comm/__init__.py +11 -0
mslk/conv/__init__.py +11 -0
mslk/gemm/__init__.py +18 -0
mslk/gemm/triton/__init__.py +7 -0
mslk/gemm/triton/fp8_gemm.py +2702 -0
mslk/gemm/triton/grouped_gemm.py +1132 -0
mslk/gemm/triton/matmul_perf_model.py +237 -0
mslk/gemm/triton/utils.py +128 -0
mslk/kv_cache/__init__.py +11 -0
mslk/moe/__init__.py +26 -0
mslk/moe/activation.py +291 -0
mslk/moe/gather_scatter.py +739 -0
mslk/moe/layers.py +1240 -0
mslk/moe/shuffling.py +421 -0
mslk/mslk.so +0 -0
mslk/quantize/__init__.py +11 -0
mslk/quantize/shuffle.py +306 -0
mslk/quantize/triton/__init__.py +7 -0
mslk/quantize/triton/fp4_quantize.py +5942 -0
mslk/quantize/triton/fp8_quantize.py +1902 -0
mslk/testing/__init__.py +7 -0
mslk/testing/attributes.py +60 -0
mslk/testing/rocm.py +91 -0
mslk/utils/__init__.py +7 -0
mslk/utils/torch/__init__.py +7 -0
mslk/utils/torch/library.py +150 -0
mslk/utils/triton/__init__.py +7 -0
mslk/utils/triton/fp8_utils.py +72 -0
mslk/utils/triton/utils.py +128 -0
mslk/version.py +11 -0
mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0

mslk/gemm/triton/matmul_perf_model.py ADDED Viewed

@@ -0,0 +1,237 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Source: https://github.com/triton-lang/kernels/blob/8821ef322394ee2d3c58a859780ee1e2e10b5c79/kernels/matmul_perf_model.py
+# This file is taken from the upstream triton-lang/kernels repo.
+# Currently that repo does not have a license file, so disabling
+# the license lint for now:
+# @lint-ignore-every LICENSELINT
+# flake8: noqa
+# pyre-ignore-all-errors
+import functools
+import heapq
+import torch
+from triton import cdiv  # @manual
+from triton.runtime import driver  # @manual
+from triton.testing import (  # @manual
+    get_dram_gbps,
+    get_max_simd_tflops,
+    get_max_tensorcore_tflops,
+    nvsmi,
+)
+@functools.lru_cache()
+def get_clock_rate_in_khz():
+    try:
+        return nvsmi(["clocks.max.sm"])[0] * 1e3
+    except FileNotFoundError:
+        import pynvml  # @manual=fbsource//third-party/pypi/pynvml:pynvml
+        pynvml.nvmlInit()
+        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+        return pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM) * 1e3
+def get_tensorcore_tflops(device, num_ctas, num_warps, dtype):
+    """return compute throughput in TOPS"""
+    total_warps = num_ctas * min(num_warps, 4)
+    num_subcores = (
+        driver.active.utils.get_device_properties(device)["multiprocessor_count"] * 4
+    )  # on recent GPUs
+    tflops = (
+        min(num_subcores, total_warps)
+        / num_subcores
+        * get_max_tensorcore_tflops(dtype, get_clock_rate_in_khz(), device)
+    )
+    return tflops
+def get_simd_tflops(device, num_ctas, num_warps, dtype):
+    """return compute throughput in TOPS"""
+    total_warps = num_ctas * min(num_warps, 4)
+    num_subcores = (
+        driver.active.utils.get_device_properties(device)["multiprocessor_count"] * 4
+    )  # on recent GPUs
+    tflops = (
+        min(num_subcores, total_warps)
+        / num_subcores
+        * get_max_simd_tflops(dtype, get_clock_rate_in_khz(), device)
+    )
+    return tflops
+def get_tflops(device, num_ctas, num_warps, dtype):
+    capability = torch.cuda.get_device_capability(device)
+    if capability[0] < 8 and dtype == torch.float32:
+        return get_simd_tflops(device, num_ctas, num_warps, dtype)
+    return get_tensorcore_tflops(device, num_ctas, num_warps, dtype)
+def estimate_matmul_time(
+    # backend, device,
+    num_warps,
+    num_stages,  #
+    A,
+    B,
+    C,  #
+    M,
+    N,
+    K,  #
+    BLOCK_M,
+    BLOCK_N,
+    BLOCK_K,
+    SPLIT_K,  #
+    debug=False,
+    **kwargs,  #
+):
+    """return estimated running time in ms
+    = max(compute, loading) + store"""
+    device = torch.cuda.current_device()
+    dtype = A.dtype
+    dtsize = A.element_size()
+    num_cta_m = cdiv(M, BLOCK_M)
+    num_cta_n = cdiv(N, BLOCK_N)
+    num_cta_k = SPLIT_K
+    num_ctas = num_cta_m * num_cta_n * num_cta_k
+    # If the input is smaller than the block size
+    M, N = max(M, BLOCK_M), max(N, BLOCK_N)
+    # time to compute
+    total_ops = 2 * M * N * K / (1024 * 1024 * 1024)  # GOPS
+    tput = get_tflops(device, num_ctas, num_warps, dtype)
+    compute_ms = total_ops / tput
+    # time to load data
+    num_sm = driver.active.utils.get_device_properties(device)["multiprocessor_count"]
+    active_cta_ratio = min(1, num_ctas / num_sm)
+    active_cta_ratio_bw1 = min(
+        1, num_ctas / 32
+    )  # 32 active ctas are enough to saturate
+    active_cta_ratio_bw2 = max(
+        min(1, (num_ctas - 32) / (108 - 32)), 0
+    )  # 32-108, remaining 5%
+    dram_bw = get_dram_gbps(device) * (
+        active_cta_ratio_bw1 * 0.95 + active_cta_ratio_bw2 * 0.05
+    )  # in GB/s
+    l2_bw = dram_bw * 4  # rough estimation (should be 4.7 for A100?)
+    # assume 80% of (following) loads are in L2 cache
+    load_a_dram = M * K * dtsize * (1 + 0.2 * (num_cta_n - 1))
+    load_a_l2 = M * K * dtsize * 0.8 * (num_cta_n - 1)
+    load_b_dram = N * K * dtsize * (1 + 0.2 * (num_cta_m - 1))
+    load_b_l2 = N * K * dtsize * 0.8 * (num_cta_m - 1)
+    # total
+    total_dram = (load_a_dram + load_b_dram) / (1024 * 1024)  # MB
+    total_l2 = (load_a_l2 + load_b_l2) / (1024 * 1024)
+    # loading time in ms
+    load_ms = total_dram / dram_bw + total_l2 / l2_bw
+    # estimate storing time
+    store_bw = dram_bw * 0.6  # :o
+    store_c_dram = M * N * dtsize * SPLIT_K / (1024 * 1024)  # MB
+    if SPLIT_K == 1:
+        store_ms = store_c_dram / store_bw
+    else:
+        reduce_bw = store_bw
+        store_ms = store_c_dram / reduce_bw
+        # c.zero_()
+        zero_ms = M * N * 2 / (1024 * 1024) / store_bw
+        store_ms += zero_ms
+    total_time_ms = max(compute_ms, load_ms) + store_ms
+    if debug:
+        print(
+            f"Total time: {total_time_ms}ms, compute time: {compute_ms}ms, "
+            f"loading time: {load_ms}ms, store time: {store_ms}ms, "
+            f"Activate CTAs: {active_cta_ratio * 100}%"
+        )
+    return total_time_ms
+def early_config_prune(configs, named_args, **kwargs):
+    device = torch.cuda.current_device()
+    capability = torch.cuda.get_device_capability()
+    # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages
+    dtsize = named_args["A"].element_size()
+    dtype = named_args["A"].dtype
+    # 1. make sure we have enough smem
+    pruned_configs = []
+    for config in configs:
+        kw = config.kwargs
+        BLOCK_M, BLOCK_N, BLOCK_K, num_stages = (
+            kw["BLOCK_M"],
+            kw["BLOCK_N"],
+            kw["BLOCK_K"],
+            config.num_stages,
+        )
+        max_shared_memory = driver.active.utils.get_device_properties(device)[
+            "max_shared_mem"
+        ]
+        required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
+        if required_shared_memory <= max_shared_memory:
+            pruned_configs.append(config)
+    configs = pruned_configs
+    # Some dtypes do not allow atomic_add
+    if dtype not in [torch.float16, torch.float32]:
+        configs = [config for config in configs if config.kwargs["SPLIT_K"] == 1]
+    # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps)
+    configs_map = {}
+    for config in configs:
+        kw = config.kwargs
+        BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages = (
+            kw["BLOCK_M"],
+            kw["BLOCK_N"],
+            kw["BLOCK_K"],
+            kw["SPLIT_K"],
+            config.num_warps,
+            config.num_stages,
+        )
+        key = (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps)
+        if key in configs_map:
+            configs_map[key].append((config, num_stages))
+        else:
+            configs_map[key] = [(config, num_stages)]
+    pruned_configs = []
+    for k, v in configs_map.items():
+        BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps = k
+        if capability[0] >= 8:
+            # compute cycles (only works for ampere GPUs)
+            mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16)
+            mma_cycles = mmas / min(4, num_warps) * 8
+            ldgsts_latency = 300  # Does this matter?
+            optimal_num_stages = ldgsts_latency / mma_cycles
+            # nearest stages, prefer large #stages
+            nearest = heapq.nsmallest(
+                2,
+                v,
+                key=lambda x: (
+                    10 + abs(x[1] - optimal_num_stages)
+                    if (x[1] - optimal_num_stages) < 0
+                    else x[1] - optimal_num_stages
+                ),
+            )
+            for n in nearest:
+                pruned_configs.append(n[0])
+        else:  # Volta & Turing only supports num_stages <= 2
+            random_config = v[0][0]
+            random_config.num_stages = 2
+            pruned_configs.append(random_config)
+    return pruned_configs

mslk/gemm/triton/utils.py ADDED Viewed

@@ -0,0 +1,128 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import sys
+import torch
+import triton  # @manual
+import triton.language as tl  # @manual
+def map_dtype_to_triton(dtype: torch.dtype) -> tl.dtype:
+    """
+    Maps torch dtype to triton dtype.
+    Args:
+        dtype (torch.dtype): input dtype.
+    Returns:
+        tl.dtype: triton dtype.
+    """
+    if dtype == torch.float16:
+        return tl.float16
+    elif dtype == torch.bfloat16:
+        return tl.bfloat16
+    elif dtype == torch.float32:
+        return tl.float32
+    elif dtype == torch.int32:
+        return tl.int32
+    elif dtype == torch.float8_e4m3fn and torch.version.hip is None:
+        return tl.float8e4nv
+    else:
+        raise ValueError(f"Unsupported dtype {dtype}")
+# check if we have the TMA version in Triton PR #4498 (https://github.com/triton-lang/triton/pull/4498).
+HAS_TMA_DESC = "nv_tma_desc_type" in dir(tl)
+if HAS_TMA_DESC:
+    print(
+        "TMA benchmarks will be running with experimental grid constant TMA descriptor.",
+        file=sys.stderr,
+    )
+else:
+    print(
+        "TMA benchmarks will be running without grid constant TMA descriptor.",
+        file=sys.stderr,
+    )
+class TmaAutoTuneHelper:
+    # duck typing wrapper to implement the same interface as TmaDescKernelParam in Triton PR #4498
+    class KernelParamWrapper:
+        def __init__(self, desc):
+            self.desc = desc
+        def tma_desc_cpu_ptr(self):
+            return self.desc.data_ptr()
+    TMA_SIZE = 128
+    def __init__(self):
+        self.fill_1d_tma_descriptor_inner = (
+            triton.runtime.driver.active.utils.fill_1d_tma_descriptor
+        )
+        self.fill_2d_tma_descriptor_inner = (
+            triton.runtime.driver.active.utils.fill_2d_tma_descriptor
+        )
+        if HAS_TMA_DESC:
+            self.descriptors = {}
+        else:
+            self.cuda_descriptors = {}
+    # Call this method outside of the lambda function for grid size
+    def init_tma_descriptor(self, name):
+        if HAS_TMA_DESC:
+            self.descriptors[name] = torch.empty(
+                TmaAutoTuneHelper.TMA_SIZE, device="cpu", dtype=torch.int8
+            )
+        else:
+            self.cuda_descriptors[name] = torch.empty(
+                TmaAutoTuneHelper.TMA_SIZE, device="cuda", dtype=torch.int8
+            )
+    # Call this method inside the lambda function for grid size
+    def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_size):
+        if HAS_TMA_DESC:
+            desc_x = self.descriptors[name]
+            assert desc_x.data_ptr() % 64 == 0
+            self.fill_1d_tma_descriptor_inner(
+                ptr, dim, block_dim, element_size, desc_x.data_ptr()
+            )
+        else:
+            desc_x = self.cuda_descriptors[name]
+            buf_x = torch.empty_like(desc_x, device="cpu", pin_memory=True)
+            self.fill_1d_tma_descriptor_inner(
+                ptr, dim, block_dim, element_size, buf_x.data_ptr()
+            )
+            desc_x.copy_(buf_x, non_blocking=True)
+    # Call this method inside the lambda function for grid size
+    def fill_2d_tma_descriptor(
+        self, name, ptr, dim1, dim0, block_dim1, block_dim0, element_size
+    ):
+        if HAS_TMA_DESC:
+            desc_x = self.descriptors[name]
+            assert desc_x.data_ptr() % 64 == 0
+            self.fill_2d_tma_descriptor_inner(
+                ptr, dim1, dim0, block_dim1, block_dim0, element_size, desc_x.data_ptr()
+            )
+        else:
+            desc_x = self.cuda_descriptors[name]
+            buf_x = torch.empty_like(desc_x, device="cpu", pin_memory=True)
+            self.fill_2d_tma_descriptor_inner(
+                ptr, dim1, dim0, block_dim1, block_dim0, element_size, buf_x.data_ptr()
+            )
+            desc_x.copy_(buf_x, non_blocking=True)
+    def get_tma_descriptor_kernel_param(self, name):
+        if HAS_TMA_DESC:
+            assert self.descriptors[name] is not None
+            return self.KernelParamWrapper(self.descriptors[name])
+        else:
+            assert self.cuda_descriptors[name] is not None
+            return self.cuda_descriptors[name]

mslk/kv_cache/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+from mslk.utils.torch.library import load_library_buck
+load_library_buck("//mslk/csrc/kv_cache:kv_cache_ops")

mslk/moe/__init__.py ADDED Viewed

@@ -0,0 +1,26 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import mslk  # noqa F401
+import torch
+from mslk.utils.torch.library import load_library_buck
+load_library_buck("//mslk/csrc/moe:moe_ops")
+index_shuffling = None
+if torch.cuda.is_available():
+    index_shuffling = torch.ops.mslk.index_shuffling  # noqa F401
+from .activation import silu_mul, silu_mul_quant  # noqa F401
+from .gather_scatter import (  # noqa F401
+    gather_scale_dense_tokens,
+    gather_scale_quant_dense_tokens,
+    scatter_add_dense_tokens,
+    scatter_add_padded_tokens,
+)
+from .shuffling import combine_shuffling, split_shuffling  # noqa F401

mslk/moe/activation.py ADDED Viewed

@@ -0,0 +1,291 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+from typing import Optional
+import torch
+import triton
+import triton.language as tl
+from mslk.utils.triton.fp8_utils import get_fp8_constants
+# Function APIs
+def silu_mul(
+    x0: torch.Tensor,
+    x1: torch.Tensor,
+    valid_token_count: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    Fused silu and mul operations.
+    y = x0 * sigmoid(x0) * x1
+    Args:
+        x0: input tensor of shape (T, D)
+        x1: input tensor of shape (T, D)
+        valid_token_count: tensor of shape (1,) to indicate the number of valid tokens.
+    Returns:
+        output tensor of shape (T, D)
+    """
+    assert x0.ndim == 2 and x0.stride(1) == 1
+    assert x1.ndim == 2 and x1.stride(1) == 1
+    assert x0.shape == x1.shape
+    assert x0.dtype == x1.dtype
+    T, D = x0.shape
+    stride_0 = x0.stride(0)
+    stride_1 = x1.stride(0)
+    out = torch.empty((T, D), device="cuda", dtype=x0.dtype)
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+    if T >= NUM_SMS:
+        BLOCK_D_OUTER = D
+        BLOCK_D_INNER = 1024
+        assert D % BLOCK_D_INNER == 0
+    else:
+        BLOCK_D_OUTER = 512
+        BLOCK_D_INNER = 256
+        assert D % BLOCK_D_OUTER == 0
+    grid = (T, D // BLOCK_D_OUTER)
+    _mslk_silu_mul[grid](
+        out,
+        x0,
+        x1,
+        stride_0,
+        stride_1,
+        valid_token_count,
+        D,  # pyre-ignore
+        BLOCK_D_OUTER,  # pyre-ignore
+        BLOCK_D_INNER,  # pyre-ignore
+    )
+    return out
+def silu_mul_quant(
+    x0: torch.Tensor,
+    x1: torch.Tensor,
+    scale_ub: Optional[torch.Tensor] = None,
+    valid_token_count: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Fused silu, mul, and FP8 rowwise quantization operations.
+    y, y_scale = quantize(x0 * sigmoid(x0) * x1)
+    Args:
+        x0: input tensor of shape (T, D)
+        x1: input tensor of shape (T, D)
+        scale_ub: tensor of shape (1,) to indicate the upper bound of the scale.
+        valid_token_count: tensor of shape (1,) to indicate the number of valid tokens.
+    Returns:
+        output quantized tensor of shape (T, D) and its inverse scale of shape (T,)
+    """
+    assert x0.ndim == 2 and x0.stride(1) == 1
+    assert x1.ndim == 2 and x1.stride(1) == 1
+    assert x0.shape == x1.shape
+    assert x0.dtype == x1.dtype
+    pt_dtype, tl_dtype, max_fp8, eps = get_fp8_constants()
+    T, D = x0.shape
+    stride_0 = x0.stride(0)
+    stride_1 = x1.stride(0)
+    out = torch.empty((T, D), device="cuda", dtype=pt_dtype)
+    out_inv_scale = torch.empty((T,), device="cuda", dtype=torch.float32)
+    if T == 0:
+        return out, out_inv_scale
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+    BLOCK_T = triton.cdiv(T, NUM_SMS)
+    NUM_CTAS = triton.cdiv(T, BLOCK_T)
+    grid = (NUM_CTAS,)
+    _mslk_silu_mul_quant[grid](
+        out,
+        out_inv_scale,
+        x0,
+        x1,
+        scale_ub,
+        stride_0,
+        stride_1,
+        valid_token_count,
+        T,
+        D,  # pyre-ignore
+        BLOCK_T,
+        TL_FP8_DTYPE=tl_dtype,  # pyre-ignore
+        MAX_FP8=max_fp8,  # pyre-ignore
+        EPS=eps,  # pyre-ignore
+        CLAMP_MAX=scale_ub is not None,  # pyre-ignore
+    )
+    return out, out_inv_scale
+# Torch Custom Op Registrations
+_SILU_MUL_OP_NAME = "mslk::silu_mul"
+torch.library.define(
+    "mslk::silu_mul",
+    "(Tensor x0, Tensor x1, Tensor? valid_token_count=None) -> Tensor",
+)
+@torch.library.impl(_SILU_MUL_OP_NAME, "Meta")
+def silu_mul_meta(x0, x1, valid_token_count):
+    return x0.new_empty(x0.shape)
+@torch.library.impl(_SILU_MUL_OP_NAME, "CUDA")
+def silu_mul_cuda(x0, x1, valid_token_count):
+    return silu_mul(x0, x1, valid_token_count)
+_SILU_MUL_OP_QUANT_NAME = "mslk::silu_mul_quant"
+torch.library.define(
+    "mslk::silu_mul_quant",
+    "(Tensor x0, Tensor x1, Tensor? scale_ub=None, Tensor? valid_token_count=None) -> (Tensor, Tensor)",
+)
+@torch.library.impl(_SILU_MUL_OP_QUANT_NAME, "Meta")
+def silu_mul_quant_meta(x0, x1, scale_ub, valid_token_count):
+    pt_dtype, tl_dtype, max_fp8, eps = get_fp8_constants()
+    return torch.empty(x0.shape, device=x0.device, dtype=pt_dtype)
+@torch.library.impl(_SILU_MUL_OP_QUANT_NAME, "CUDA")
+def silu_mul_quant_cuda(x0, x1, scale_ub=None, valid_token_count=None):
+    return silu_mul_quant(x0, x1, scale_ub, valid_token_count)
+# Kernel Implementations
+# Triton kernel computing y = x0 * sigmoid(x0) * x1.
+# Grid axis 0 selects the token row; grid axis 1 selects a BLOCK_D_OUTER-wide
+# slice of the feature dimension, which is walked BLOCK_D_INNER elements at a
+# time. Loads and stores are unmasked, so the caller must pick block sizes
+# that divide D (silu_mul asserts this on the host side).
+@triton.jit
+def _mslk_silu_mul(
+    y_ptr,
+    x0_ptr,
+    x1_ptr,
+    stride_0,
+    stride_1,
+    valid_token_count,
+    D: tl.constexpr,
+    BLOCK_D_OUTER: tl.constexpr,
+    BLOCK_D_INNER: tl.constexpr,
+) -> None:
+    token_index = tl.program_id(0)
+    # Offsets of the first inner block inside this program's feature slice.
+    feature_offset = tl.program_id(1) * BLOCK_D_OUTER + tl.arange(0, BLOCK_D_INNER)[:]
+    if valid_token_count is not None:
+        # Replace the pointer with the scalar it points to; rows at or beyond
+        # that count return without writing anything.
+        valid_token_count = tl.load(
+            valid_token_count, None, eviction_policy="evict_last"
+        )
+        if token_index >= valid_token_count:
+            return
+    for _ in tl.range(0, BLOCK_D_OUTER // BLOCK_D_INNER, num_stages=3):
+        # Inputs use their own row strides; the math is done in float32.
+        x0 = tl.load(
+            x0_ptr + token_index * stride_0 + feature_offset,
+            None,
+            eviction_policy="evict_first",
+        ).to(tl.float32)
+        x1 = tl.load(
+            x1_ptr + token_index * stride_1 + feature_offset,
+            None,
+            eviction_policy="evict_first",
+        ).to(tl.float32)
+        y = x0 * tl.sigmoid(x0) * x1
+        # The output is addressed with a row stride of D (i.e. as contiguous).
+        tl.store(
+            y_ptr + token_index * D + feature_offset,
+            y,
+            None,
+        )
+        # Advance to the next inner block of this slice.
+        feature_offset += BLOCK_D_INNER
+# Triton kernel computing y = x0 * sigmoid(x0) * x1 and quantizing each row
+# to TL_FP8_DTYPE with a per-row scale. Each program handles up to BLOCK_T
+# consecutive token rows and writes, per row, the quantized values to y_ptr
+# and the inverse scale to y_inv_scale_ptr.
+@triton.jit
+def _mslk_silu_mul_quant(
+    y_ptr,
+    y_inv_scale_ptr,
+    x0_ptr,
+    x1_ptr,
+    scale_ub_ptr,
+    stride_0,
+    stride_1,
+    valid_token_count,
+    T,
+    D: tl.constexpr,
+    BLOCK_T: tl.constexpr,
+    TL_FP8_DTYPE: tl.constexpr,
+    MAX_FP8: tl.constexpr,
+    EPS: tl.constexpr,
+    CLAMP_MAX: tl.constexpr,
+) -> None:
+    # Feature offsets span the next power of two >= D; lanes >= D are masked.
+    PADDED_D: tl.constexpr = triton.next_power_of_2(D)  # pyre-ignore
+    tidx = tl.program_id(0)
+    # Row range [start_idx, end_idx) owned by this program, clipped to T.
+    start_idx = tidx * BLOCK_T
+    end_idx = tl.minimum(start_idx + BLOCK_T, T)
+    if valid_token_count is not None:
+        valid_token_count = tl.load(
+            valid_token_count, None, eviction_policy="evict_last"
+        )
+        # NOTE(review): only the first row of the range is compared, and
+        # end_idx is not clipped to valid_token_count, so rows past the valid
+        # count inside a partially valid range are still processed.
+        if start_idx >= valid_token_count:
+            return
+    offsets = tl.arange(0, PADDED_D)[:]
+    mask = offsets < D
+    # Upper bound for the row maximum: loaded when CLAMP_MAX, else unbounded.
+    if CLAMP_MAX:
+        ub = tl.load(scale_ub_ptr, eviction_policy="evict_last")
+    else:
+        ub = float("inf")
+    for token_index in tl.range(start_idx, end_idx, 1, num_stages=2):
+        x0 = tl.load(
+            x0_ptr + token_index * stride_0 + offsets,
+            mask,
+            eviction_policy="evict_first",
+        ).to(tl.float32)
+        x1 = tl.load(
+            x1_ptr + token_index * stride_1 + offsets,
+            mask,
+            eviction_policy="evict_first",
+        ).to(tl.float32)
+        y = x0 * tl.sigmoid(x0) * x1
+        # Masked values are set to 0.0.
+        row_max = tl.max(tl.where(mask, tl.abs(y), 0.0))
+        # Keep the row maximum at least EPS (and at most ub when clamping) so
+        # the division below never sees zero.
+        if CLAMP_MAX:
+            row_max = tl.clamp(row_max, EPS, ub)
+        else:
+            row_max = tl.maximum(row_max, EPS)
+        y_scale = MAX_FP8 / row_max
+        # Store the inverse of the scale applied to this row.
+        tl.store(y_inv_scale_ptr + token_index, 1.0 / y_scale)
+        y = y * y_scale
+        # Clamp A to fp8 range to make sure there's no overflow.
+        # This is required for AMD. Nvidia's default saturation
+        # handles it, but it's nice to have anyway.
+        y_fp8 = tl.clamp(y, -MAX_FP8, MAX_FP8).to(TL_FP8_DTYPE)
+        # The output is addressed with a row stride of D (i.e. as contiguous).
+        tl.store(
+            y_ptr + token_index * D + offsets,
+            y_fp8,
+            mask,
+        )