PyPI - mslk-cuda-nightly - Versions diffs - 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl - Mend

mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

mslk/__init__.py +56 -0
mslk/attention/__init__.py +7 -0
mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
mslk/attention/flash_attn/__init__.py +22 -0
mslk/attention/flash_attn/ampere_helpers.py +104 -0
mslk/attention/flash_attn/barrier.py +72 -0
mslk/attention/flash_attn/benchmark.py +269 -0
mslk/attention/flash_attn/blackwell_helpers.py +754 -0
mslk/attention/flash_attn/block_info.py +109 -0
mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
mslk/attention/flash_attn/block_sparsity.py +219 -0
mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
mslk/attention/flash_attn/copy_utils.py +341 -0
mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
mslk/attention/flash_attn/fast_math.py +22 -0
mslk/attention/flash_attn/flash_bwd.py +1262 -0
mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
mslk/attention/flash_attn/flash_fwd.py +2471 -0
mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
mslk/attention/flash_attn/hopper_helpers.py +102 -0
mslk/attention/flash_attn/interface.py +1771 -0
mslk/attention/flash_attn/mask.py +610 -0
mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
mslk/attention/flash_attn/named_barrier.py +32 -0
mslk/attention/flash_attn/pack_gqa.py +165 -0
mslk/attention/flash_attn/paged_kv.py +176 -0
mslk/attention/flash_attn/pipeline.py +273 -0
mslk/attention/flash_attn/seqlen_info.py +139 -0
mslk/attention/flash_attn/softmax.py +583 -0
mslk/attention/flash_attn/testing.py +424 -0
mslk/attention/flash_attn/tile_scheduler.py +720 -0
mslk/attention/flash_attn/utils.py +860 -0
mslk/attention/fmha/__init__.py +967 -0
mslk/attention/fmha/_triton/__init__.py +6 -0
mslk/attention/fmha/_triton/available.py +50 -0
mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
mslk/attention/fmha/attn_bias.py +2186 -0
mslk/attention/fmha/attn_bias_utils.py +536 -0
mslk/attention/fmha/ck.py +508 -0
mslk/attention/fmha/ck_decoder.py +141 -0
mslk/attention/fmha/ck_splitk.py +204 -0
mslk/attention/fmha/common.py +598 -0
mslk/attention/fmha/cutlass.py +461 -0
mslk/attention/fmha/cutlass_blackwell.py +560 -0
mslk/attention/fmha/dispatch.py +224 -0
mslk/attention/fmha/flash.py +862 -0
mslk/attention/fmha/flash3.py +858 -0
mslk/attention/fmha/flash_mtia.py +245 -0
mslk/attention/fmha/merge_training.py +192 -0
mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
mslk/attention/fmha/torch_attention_compat.py +154 -0
mslk/attention/fmha/tree_attention.py +718 -0
mslk/attention/fmha/triton_splitk.py +1378 -0
mslk/attention/fmha/unbind.py +130 -0
mslk/attention/fmha/utils/__init__.py +6 -0
mslk/attention/fmha/utils/bench.py +74 -0
mslk/attention/fmha/utils/cpp_lib.py +148 -0
mslk/attention/fmha/utils/op_common.py +65 -0
mslk/attention/gqa_attn_splitk/__init__.py +11 -0
mslk/bench/comm/__init__.py +7 -0
mslk/bench/comm/comm_bench.py +255 -0
mslk/bench/common/__init__.py +5 -0
mslk/bench/common/utils.py +148 -0
mslk/bench/conv/__init__.py +7 -0
mslk/bench/conv/conv_bench.py +551 -0
mslk/bench/conv/conv_ops.py +213 -0
mslk/bench/gemm/__init__.py +7 -0
mslk/bench/gemm/gemm_bench.py +859 -0
mslk/bench/gemm/gemm_ops.py +3342 -0
mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
mslk/bench/moe/__init__.py +7 -0
mslk/bench/moe/gather_scatter_bench.py +356 -0
mslk/bench/quantize/quantize_bench.py +345 -0
mslk/bench/quantize/quantize_ops.py +266 -0
mslk/comm/__init__.py +11 -0
mslk/conv/__init__.py +11 -0
mslk/gemm/__init__.py +18 -0
mslk/gemm/triton/__init__.py +7 -0
mslk/gemm/triton/fp8_gemm.py +2702 -0
mslk/gemm/triton/grouped_gemm.py +1132 -0
mslk/gemm/triton/matmul_perf_model.py +237 -0
mslk/gemm/triton/utils.py +128 -0
mslk/kv_cache/__init__.py +11 -0
mslk/moe/__init__.py +26 -0
mslk/moe/activation.py +291 -0
mslk/moe/gather_scatter.py +739 -0
mslk/moe/layers.py +1240 -0
mslk/moe/shuffling.py +421 -0
mslk/mslk.so +0 -0
mslk/quantize/__init__.py +11 -0
mslk/quantize/shuffle.py +306 -0
mslk/quantize/triton/__init__.py +7 -0
mslk/quantize/triton/fp4_quantize.py +5942 -0
mslk/quantize/triton/fp8_quantize.py +1902 -0
mslk/testing/__init__.py +7 -0
mslk/testing/attributes.py +60 -0
mslk/testing/rocm.py +91 -0
mslk/utils/__init__.py +7 -0
mslk/utils/torch/__init__.py +7 -0
mslk/utils/torch/library.py +150 -0
mslk/utils/triton/__init__.py +7 -0
mslk/utils/triton/fp8_utils.py +72 -0
mslk/utils/triton/utils.py +128 -0
mslk/version.py +11 -0
mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0

mslk/attention/fmha/unbind.py ADDED Viewed

@@ -0,0 +1,130 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+from typing import List, Optional, Sequence, Tuple, Union
+import torch
+from .utils.op_common import _get_storage_base
+def get_stack_strides(
+    tensors: Sequence[torch.Tensor], dim: int
+) -> Optional[Tuple[Union[int, torch.SymInt], ...]]:
+    """
+    If the tensors are already stacked on dimension :code:`dim`, \
+        returns the strides of the stacked tensors. \
+        Otherwise returns :code:`None`.
+    """
+    if len(tensors) <= 1 or dim > tensors[0].ndim:
+        return None
+    final_stride = []
+    for i in range(tensors[0].ndim + 1):
+        if i == dim:
+            # PyTorch 2.5 messed up the type annotations for SymInt, but 2.6 will fix it
+            # https://github.com/pytorch/pytorch/issues/138478
+            final_stride.append(
+                tensors[1].storage_offset() - tensors[0].storage_offset()  # type: ignore[operator]
+            )
+            continue
+        if i > dim:
+            i -= 1
+        final_stride.append(tensors[0].stride(i))
+    storage_data_ptr: Optional[int] = None
+    for i, x in enumerate(tensors[1:]):
+        # Sanity checks
+        if x.shape != tensors[0].shape:
+            return None
+        if x.stride() != tensors[0].stride():
+            return None
+        # PyTorch 2.5 messed up the type annotations for SymInt, but 2.6 will fix it
+        # https://github.com/pytorch/pytorch/issues/138478
+        if (
+            x.storage_offset()
+            != tensors[0].storage_offset() + (i + 1) * final_stride[dim]  # type: ignore[operator]
+        ):
+            return None
+        if storage_data_ptr is None:
+            storage_data_ptr = _get_storage_base(tensors[0])
+        # Actual storage check
+        if _get_storage_base(x) != storage_data_ptr:
+            return None
+    return tuple(final_stride)
+def _stack_or_none_fw(
+    tensors: Union[Tuple[torch.Tensor, ...], List[torch.Tensor]],
+    dim: int,
+) -> Optional[torch.Tensor]:
+    strides = get_stack_strides(tensors, dim)
+    if strides is not None:
+        input_shape = list(tensors[0].shape)
+        input_shape.insert(dim, len(tensors))
+        return tensors[0].as_strided(input_shape, strides)
+    return None
+def _stack_fw(
+    tensors: Union[Tuple[torch.Tensor, ...], List[torch.Tensor]],
+    dim: int,
+) -> torch.Tensor:
+    out = _stack_or_none_fw(tensors, dim)
+    if out is None:
+        out = torch.stack(tensors, dim=dim)
+    return out
+class _Unbind(torch.autograd.Function):
+    """
+    See function `unbind`
+    Forward splits the input with :code:`Tensor.unbind`; backward rebuilds the
+    gradient of the input through `_stack_fw`.
+    """
+    @staticmethod
+    # type: ignore
+    def forward(ctx, x: torch.Tensor, dim: int):
+        # Remember the dimension so that backward can re-stack along it
+        ctx.dim = dim
+        return x.unbind(dim)
+    @classmethod
+    # type: ignore
+    def backward(cls, ctx, *tensors: torch.Tensor):
+        # One entry per forward input: the stacked gradient for `x`, None for `dim`
+        return _stack_fw(tensors, ctx.dim), None
+class _StackOrNone(torch.autograd.Function):
+    """
+    See function `stack_or_none`
+    Forward delegates to `_stack_or_none_fw`; backward splits the incoming
+    gradient back into one gradient per input tensor.
+    """
+    @staticmethod
+    # type: ignore
+    def forward(ctx, dim: int, *tensors: torch.Tensor):
+        # Remember the dimension so that backward can unbind along it
+        ctx.dim = dim
+        return _stack_or_none_fw(tensors, dim=dim)
+    @classmethod
+    # type: ignore
+    def backward(cls, ctx, grad: torch.Tensor):
+        # None is the gradient slot for `dim`, followed by one slice per tensor
+        return (None, *grad.unbind(dim=ctx.dim))
+def unbind(x: torch.Tensor, dim: int) -> Tuple[torch.Tensor, ...]:
+    """
+    Does exactly the same as :attr:`torch.unbind` for the forward.
+    In backward, avoids the copy made by :attr:`torch.stack` if the gradients
+    are already multiple views of the same storage
+    """
+    return _Unbind.apply(x, dim)
+def stack_or_none(tensors: Sequence[torch.Tensor], dim: int) -> Optional[torch.Tensor]:
+    """
+    Does exactly the same as :attr:`torch.stack` if the tensors can be concatenated
+    without any memory operation. Otherwise returns None.
+    """
+    # `dim` goes first because the autograd function takes the tensors as varargs
+    return _StackOrNone.apply(dim, *tensors)

mslk/attention/fmha/utils/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe

mslk/attention/fmha/utils/bench.py ADDED Viewed

@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+from typing import Callable, List, Optional
+import torch
+# from https://github.com/openai/triton/blob/95d9b7f4ae21710dc899e1de6a579b2136ea4f3d/python/triton/testing.py#L19
+def do_bench_cudagraph(
+    fn: Callable, rep: int = 20, grad_to_none: Optional[List[torch.Tensor]] = None
+) -> float:
+    """
+    Benchmark the runtime of the provided function.
+    The function is captured into a CUDA graph and timed with CUDA events,
+    first once to estimate its duration, then as `n_repeat` unrolled calls
+    replayed 10 times; the mean per-call time is returned.
+    Args:
+        fn: Function to benchmark
+        rep: Repetition time (in ms)
+        grad_to_none: Reset the gradient of the provided tensor to None
+    Returns:
+        Benchmarked runtime in ms
+    Raises:
+        RuntimeError: if the current CUDA stream is the default stream.
+    """
+    if torch.cuda.current_stream() == torch.cuda.default_stream():
+        raise RuntimeError(
+            "Cannot capture graph in default stream. "
+            "Please use side stream in benchmark code."
+        )
+    # warmup
+    fn()
+    # step 1 - we estimate the amount of time the kernel call takes
+    # NOTE: this estimate isn't super accurate because the GPU isn't warmed up at this point
+    #       but it is probably good enough
+    if grad_to_none is not None:
+        # Detach in place and re-enable grad before clearing the gradient
+        for x in grad_to_none:
+            x.detach_()
+            x.requires_grad_(True)
+            x.grad = None
+    g = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(g):
+        fn()
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    g.replay()
+    end_event.record()
+    torch.cuda.synchronize()
+    estimate_ms = start_event.elapsed_time(end_event)
+    # Number of calls that fit in `rep` ms, at least one
+    # NOTE(review): an estimate of exactly 0.0 would divide by zero here
+    n_repeat = max(1, int(rep / estimate_ms))
+    # step 2 - construct a cuda graph with `n_repeat` unrolled function calls to minimize
+    # host overhead
+    g = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(g):
+        for _i in range(n_repeat):
+            if grad_to_none is not None:
+                for x in grad_to_none:
+                    x.grad = None
+            fn()
+    torch.cuda.synchronize()
+    # measure time and return
+    ret = []
+    n_retries = 10
+    for _ in range(n_retries):
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+        g.replay()
+        end_event.record()
+        torch.cuda.synchronize()
+        # Per-call time of this replay
+        ret += [start_event.elapsed_time(end_event) / n_repeat]
+    return torch.mean(torch.tensor(ret)).item()

mslk/attention/fmha/utils/cpp_lib.py ADDED Viewed

@@ -0,0 +1,148 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import dataclasses
+import logging
+import os
+import platform
+from pathlib import Path
+from typing import Any, Dict, Optional
+import torch
+logger = logging.getLogger("mslk_fmha")
+UNAVAILABLE_FEATURES_MSG = "  Memory-efficient attention won't be available."
+@dataclasses.dataclass
+class _BuildInfo:
+    metadata: Dict[str, Any]
+    @property
+    def cuda_version(self) -> Optional[int]:
+        return self.metadata["version"]["cuda"]
+    @property
+    def hip_version(self) -> Optional[int]:
+        return self.metadata["version"]["hip"]
+    @property
+    def torch_version(self) -> str:
+        return self.metadata["version"]["torch"]
+    @property
+    def python_version(self) -> str:
+        return self.metadata["version"]["python"]
+    @property
+    def flash_version(self) -> str:
+        return self.metadata["version"].get("flash", "0.0.0")
+    @property
+    def use_torch_flash(self) -> bool:
+        return self.metadata["version"].get("use_torch_flash", False)
+    @property
+    def build_env(self) -> Dict[str, Any]:
+        return self.metadata["env"]
+class xFormersWasNotBuiltException(Exception):
+    def __str__(self) -> str:
+        return (
+            "Need to compile C++ extensions to use all fmha features.\n"
+            "    Please install xformers properly "
+            "(see https://github.com/facebookresearch/xformers#installing-xformers)\n"
+            + UNAVAILABLE_FEATURES_MSG
+        )
+class xFormersInvalidLibException(Exception):
+    """
+    Raised when the compiled extension exists but cannot be loaded.
+    Args:
+        build_info: Build metadata of the extension when known, else None;
+            when present it is used to report the versions the extension
+            was built for.
+    """
+    def __init__(self, build_info: Optional[_BuildInfo]) -> None:
+        self.build_info = build_info
+    def __str__(self) -> str:
+        if self.build_info is None:
+            msg = "fmha was built for a different version of PyTorch or Python."
+        else:
+            # Show the versions used at build time next to the running ones
+            msg = f"""fmha was built for:
+    PyTorch {self.build_info.torch_version} with CUDA {self.build_info.cuda_version} (you have {torch.__version__})
+    Python  {self.build_info.python_version} (you have {platform.python_version()})"""
+        return (
+            "fmha can't load C++/CUDA extensions. "
+            + msg
+            + "\n  Please reinstall mslk "
+            + UNAVAILABLE_FEATURES_MSG
+        )
+def _register_extensions():
+    import importlib
+    import os
+    import torch
+    # load the custom_op_library from the mslk directory
+    # and register the custom ops
+    lib_dir = str(Path(__file__).parent.parent.parent.parent)
+    if os.name == "nt":
+        # Register the main torchvision library location on the default DLL path
+        import ctypes
+        import sys
+        kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
+        with_load_library_flags = hasattr(kernel32, "AddDllDirectory")
+        prev_error_mode = kernel32.SetErrorMode(0x0001)
+        if with_load_library_flags:
+            kernel32.AddDllDirectory.restype = ctypes.c_void_p
+        if sys.version_info >= (3, 8):
+            os.add_dll_directory(lib_dir)
+        elif with_load_library_flags:
+            res = kernel32.AddDllDirectory(lib_dir)
+            if res is None:
+                err = ctypes.WinError(ctypes.get_last_error())
+                err.strerror += f' Error adding "{lib_dir}" to the DLL directories.'
+                raise err
+        kernel32.SetErrorMode(prev_error_mode)
+    loader_details = (
+        importlib.machinery.ExtensionFileLoader,
+        importlib.machinery.EXTENSION_SUFFIXES,
+    )
+    extfinder = importlib.machinery.FileFinder(lib_dir, loader_details)
+    if torch.version.hip and not hasattr(torch.version, "git_version"):
+        ext_specs = extfinder.find_spec("_C_hip")
+    else:
+        ext_specs = extfinder.find_spec("_C")
+    if ext_specs is None:
+        raise xFormersWasNotBuiltException()
+    try:
+        torch.ops.load_library(ext_specs.origin)
+    except OSError as exc:
+        raise xFormersInvalidLibException(None) from exc
+# Exception raised while loading the compiled extension at import time,
+# or None when loading succeeded.
+_cpp_library_load_exception = None
+try:
+    _register_extensions()
+except (xFormersInvalidLibException, xFormersWasNotBuiltException) as e:
+    # A load failure is downgraded to a warning so that importing still works
+    ENV_VAR_FOR_DETAILS = "XFORMERS_MORE_DETAILS"
+    # Any non-empty value of the variable attaches the traceback to the warning
+    if os.environ.get(ENV_VAR_FOR_DETAILS, False):
+        logger.warning(f"WARNING[XFORMERS]: {e}", exc_info=e)
+    else:
+        logger.warning(
+            f"WARNING[XFORMERS]: {e}\n  Set {ENV_VAR_FOR_DETAILS}=1 for more details"
+        )
+    _cpp_library_load_exception = e
+# NOTE(review): hard-coded to True regardless of the load result - confirm intent
+_built_with_cuda = True  # XXXXX

mslk/attention/fmha/utils/op_common.py ADDED Viewed

@@ -0,0 +1,65 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+from typing import Any, Dict, List, Type, TypeVar
+import torch
+def get_operator(library: str, name: str):
+    def no_such_operator(*args, **kwargs):
+        raise RuntimeError(
+            f"No such operator {library}::{name} - did you forget to build xformers with `python setup.py develop`?"
+        )
+    try:
+        return getattr(getattr(torch.ops, library), name)
+    except (RuntimeError, AttributeError):
+        return no_such_operator
+def get_xformers_operator(name: str):
+    return get_operator("xformers", name)
+class BaseOperator:
+    OPERATOR: Any  # pyre-ignore[13]
+    NAME: str  # pyre-ignore[13]
+    OPERATOR_CATEGORY: str  # pyre-ignore[13]
+    @classmethod
+    def is_available(cls) -> bool:
+        # cls.OPERATOR can be either a kernel or a Triton Autotuner object, which doesn't have __name__
+        if (
+            cls.OPERATOR is None
+            or getattr(cls.OPERATOR, "__name__", "") == "no_such_operator"
+        ):
+            return False
+        return True
+# Every class passed to `register_operator`, in registration order
+OPERATORS_REGISTRY: List[Type[BaseOperator]] = []
+# Maps each registered class's `OPERATOR` attribute back to the class
+FUNC_TO_XFORMERS_OPERATOR: Dict[Any, Type[BaseOperator]] = {}
+ClsT = TypeVar("ClsT")
+def register_operator(cls: ClsT) -> ClsT:
+    """
+    Records `cls` in `OPERATORS_REGISTRY` and `FUNC_TO_XFORMERS_OPERATOR`
+    and returns it unchanged, so it can be used as a class decorator.
+    """
+    OPERATORS_REGISTRY.append(cls)  # type: ignore
+    FUNC_TO_XFORMERS_OPERATOR[cls.OPERATOR] = cls  # type: ignore
+    return cls
+# post-2.0, avoids a warning
+# (`torch.Tensor.storage` will also be deleted in the future)
+_GET_TENSOR_STORAGE = getattr(torch.Tensor, "untyped_storage", None)
+if _GET_TENSOR_STORAGE is None:  # pre-2.0, `untyped_storage` didn't exist
+    _GET_TENSOR_STORAGE = torch.Tensor.storage
+def _get_storage_base(x: torch.Tensor) -> int:
+    return _GET_TENSOR_STORAGE(x).data_ptr()  # type: ignore

mslk/attention/gqa_attn_splitk/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+from mslk.utils.torch.library import load_library_buck
+# Importing this package calls the loader for the gqa_attn_splitk GPU ops target
+# NOTE(review): presumably this registers the ops with torch - confirm in mslk.utils.torch.library
+load_library_buck("//mslk/csrc/attention/cuda/gqa_attn_splitk:gqa_attn_splitk_ops_gpu")

mslk/bench/comm/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict

mslk/bench/comm/comm_bench.py ADDED Viewed

@@ -0,0 +1,255 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+import os
+import tempfile
+import uuid
+from functools import lru_cache
+from pprint import pprint
+import mslk.comm  # noqa: F401
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torch.distributed._symmetric_memory as symm_mem
+from torch.distributed.launcher.api import elastic_launch, LaunchConfig
+@lru_cache(None)
+def get_symm_buffer(group):
+    inp = symm_mem.empty(
+        16 * 1024 * 1024, device="cuda", dtype=torch.bfloat16
+    )  # .normal_()
+    symm_mem.rendezvous(inp, group=group)
+    return inp, group.group_name
+def _setup(path: str) -> tuple[int, int]:
+    """
+    Per-process initialisation for the benchmark.
+    Reads ``LOCAL_RANK`` and ``WORLD_SIZE`` from the environment, selects the
+    matching CUDA device, initialises the mslk NCCL ops and the
+    torch.distributed process group through files under `path`, exchanges the
+    IPC handles needed by ``torch.ops.mslk.car_init``, and warms the
+    symmetric-memory buffer cache.
+    Args:
+        path: Directory used for the file-based rendezvous.
+    Returns:
+        ``(rank, world_size)``.
+    """
+    rank = int(os.environ["LOCAL_RANK"])
+    W = int(os.environ["WORLD_SIZE"])
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
+    torch.ops.mslk.nccl_init(rank, W, os.path.join(path, "rdvz"))
+    torch.distributed.init_process_group(
+        backend="cpu:gloo,cuda:nccl",
+        init_method=f"file://{os.path.join(path, 'gloo_rdvz')}",
+        world_size=W,
+        rank=rank,
+    )
+    buffer = torch.ops.mslk.car_tensor()
+    barrier = torch.ops.mslk.car_tensor()
+    barrier.zero_()
+    # Gather every rank's handle for the buffer, then for the barrier
+    buffer_handle = torch.ops.mslk.car_ipc_handle(buffer)
+    all_buffer_handles = [torch.empty_like(buffer_handle) for _ in range(W)]
+    torch.distributed.all_gather(all_buffer_handles, buffer_handle)
+    barrier_handle = torch.ops.mslk.car_ipc_handle(barrier)
+    all_barrier_handles = [torch.empty_like(barrier_handle) for _ in range(W)]
+    torch.distributed.all_gather(all_barrier_handles, barrier_handle)
+    torch.ops.mslk.car_init(
+        rank, W, barrier, all_barrier_handles, buffer, all_buffer_handles
+    )
+    torch.cuda.synchronize()
+    torch.distributed.barrier()
+    group = dist.group.WORLD
+    # Populate the cache so later calls reuse the same symmetric buffer
+    _ = get_symm_buffer(group)
+    return rank, W
+def symm_one_shot_allreduce(dst_tensor, src_tensor, bias=None, comm_idx=None):
+    # get_symm_buffer should be called for the first time during model init,
+    # and now return cached values. Make sure group is the same as during init
+    symm_buffer, group_name = get_symm_buffer(dist.group.WORLD)
+    symm_buffer = symm_buffer[: src_tensor.numel()].view_as(src_tensor)
+    torch.ops.symm_mem.one_shot_all_reduce_copy_out(
+        symm_buffer, src_tensor, "sum", group_name, dst_tensor
+    )
+    if bias is not None:
+        dst_tensor.add_(bias)
+def symm_two_shot_allreduce(dst_tensor, src_tensor, bias=None, comm_idx=None):
+    # get_symm_buffer should be called for the first time during model init,
+    # and now return cached values. Make sure group is the same as during init
+    symm_buffer, group_name = get_symm_buffer(dist.group.WORLD)
+    # car is also doing explicit copy
+    symm_buffer = symm_buffer[: src_tensor.numel()].view_as(src_tensor)
+    symm_buffer.copy_(src_tensor)
+    torch.ops.symm_mem.two_shot_all_reduce_out(
+        symm_buffer, "sum", group_name, dst_tensor
+    )
+    if bias is not None:
+        dst_tensor.add_(bias)
+def symm_reduce_scatter(dst_tensor, src_tensor, comm_idx=None):
+    symm_buffer, group_name = get_symm_buffer(dist.group.WORLD)
+    symm_buffer = symm_buffer[: src_tensor.numel()].view_as(src_tensor)
+    symm_buffer.copy_(src_tensor)
+    torch.ops.symm_mem.reduce_scatter_out(symm_buffer, group_name, False, dst_tensor)
+def run_one_algo(fn, out, inp, num_iters, num_warmup_iters):
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    for _ in range(num_warmup_iters):
+        fn(out, inp)
+    start_event.record()
+    for _ in range(num_iters):
+        fn(out, inp)
+    end_event.record()
+    torch.cuda.synchronize()
+    time = start_event.elapsed_time(end_event) / num_iters
+    return time
+def run_benchmark(args, path):
+    rank, W = _setup(path)
+    if rank == 0:
+        print(f"Running benchmark with {W} ranks")
+    # benchmark_results = defaultdict(defaultdict)
+    benchmark_results = []
+    # with torch.profiler.profile() as p:
+    for N in torch.logspace(
+        args.min_size, args.max_size, steps=args.size_steps, base=2
+    ).tolist():
+        def round_up(a: int, b: int) -> int:
+            return ((a + b - 1) // b) * b
+        N_even_divisor = 8 * 64 if torch.version.hip else 8 * 32
+        N = round_up(int(N), N_even_divisor)
+        inp = torch.rand(N, dtype=torch.bfloat16, device="cuda")
+        results = {"N": N}
+        if args.op == "allreduce":
+            out = torch.full_like(inp, -1)
+            fns = (
+                torch.ops.mslk.one_shot_car_allreduce,
+                symm_one_shot_allreduce,
+                torch.ops.mslk.two_shot_car_allreduce,
+                symm_two_shot_allreduce,
+                torch.ops.mslk.nccl_allreduce,
+            )
+            labels = (
+                "mslk_1shot",
+                "symm_1shot",
+                "mslk_2shot",
+                "symm_2shot",
+                "nccl",
+            )
+            for fn, label in zip(fns, labels):
+                time = run_one_algo(
+                    fn,
+                    out,
+                    inp,
+                    args.num_iters,
+                    args.num_warmup_iters,
+                )
+                results[f"{label}_time"] = time
+                results[f"{label}_bwidth"] = (
+                    N * inp.element_size() / (time * 1e-3) / 1e9
+                )
+        else:
+            out = torch.full(
+                (inp.shape[0] // W,), -1, dtype=inp.dtype, device=inp.device
+            )
+            fns = (
+                torch.ops.mslk.car_reducescatter,
+                symm_reduce_scatter,
+                torch.ops.mslk.nccl_reducescatter,
+            )
+            labels = ("mslk_rs", "symm_rs", "nccl_rs")
+            for fn, label in zip(fns, labels):
+                time = run_one_algo(
+                    fn,
+                    out,
+                    inp,
+                    args.num_iters,
+                    args.num_warmup_iters,
+                )
+                results[f"{label}_time"] = time
+                results[f"{label}_bwidth"] = (
+                    N * inp.element_size() / (time * 1e-3) / 1e9
+                )
+        benchmark_results.append(results)
+    if rank == 0:
+        pprint(benchmark_results)
+        if args.export_csv:
+            csv_file = os.path.join(args.output_dir, "comm_ops_benchmark.csv")
+            # Export results to a CSV file.
+            df = pd.DataFrame(benchmark_results)
+            df.to_csv(csv_file, index=False)
+def main(args, path):
+    if args.export_csv:
+        os.makedirs(args.output_dir, exist_ok=True)
+        print("csv and images will be saved to " + args.output_dir)
+    lc = LaunchConfig(
+        min_nodes=1,
+        max_nodes=1,
+        nproc_per_node=args.num_ranks,
+        run_id=str(uuid.uuid4()),
+        rdzv_backend="c10d",
+        rdzv_endpoint="localhost:0",
+        max_restarts=0,
+        monitor_interval=1,
+    )
+    elastic_launch(lc, entrypoint=run_benchmark)(args, path)
+def invoke_main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output_dir", default="/tmp", help="Directory to save plots and csvs to"
+    )
+    parser.add_argument(
+        "--export_csv",
+        action="store_true",
+        help="Export results to a CSV file.",
+    )
+    parser.add_argument("--num_ranks", type=int, default=8)
+    parser.add_argument("--num_iters", type=int, default=20)
+    parser.add_argument("--num_warmup_iters", type=int, default=10)
+    parser.add_argument(
+        "--min_size",
+        type=int,
+        default=10,
+        help="minimum size will be set to 2**min_size",
+    )
+    parser.add_argument(
+        "--max_size",
+        type=int,
+        default=24,
+        help="maximum size will be set to 2**max_size",
+    )
+    parser.add_argument(
+        "--size_steps", type=int, default=20, help="number of size steps to run"
+    )
+    parser.add_argument(
+        "--op",
+        type=str,
+        default="allreduce",
+        choices=["allreduce", "reduce_scatter"],
+        help="op to benchmark, allreduce or reduce_scatter",
+    )
+    args = parser.parse_args()
+    with tempfile.TemporaryDirectory() as path:
+        main(args, path)
+if __name__ == "__main__":
+    invoke_main()