mslk-cuda-nightly 2026.1.19 (cp310-cp310-manylinux_2_28_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mslk/__init__.py +56 -0
- mslk/attention/__init__.py +7 -0
- mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
- mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
- mslk/attention/flash_attn/__init__.py +22 -0
- mslk/attention/flash_attn/ampere_helpers.py +104 -0
- mslk/attention/flash_attn/barrier.py +72 -0
- mslk/attention/flash_attn/benchmark.py +269 -0
- mslk/attention/flash_attn/blackwell_helpers.py +754 -0
- mslk/attention/flash_attn/block_info.py +109 -0
- mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
- mslk/attention/flash_attn/block_sparsity.py +219 -0
- mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
- mslk/attention/flash_attn/copy_utils.py +341 -0
- mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
- mslk/attention/flash_attn/fast_math.py +22 -0
- mslk/attention/flash_attn/flash_bwd.py +1262 -0
- mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
- mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
- mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
- mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
- mslk/attention/flash_attn/flash_fwd.py +2471 -0
- mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
- mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
- mslk/attention/flash_attn/hopper_helpers.py +102 -0
- mslk/attention/flash_attn/interface.py +1771 -0
- mslk/attention/flash_attn/mask.py +610 -0
- mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
- mslk/attention/flash_attn/named_barrier.py +32 -0
- mslk/attention/flash_attn/pack_gqa.py +165 -0
- mslk/attention/flash_attn/paged_kv.py +176 -0
- mslk/attention/flash_attn/pipeline.py +273 -0
- mslk/attention/flash_attn/seqlen_info.py +139 -0
- mslk/attention/flash_attn/softmax.py +583 -0
- mslk/attention/flash_attn/testing.py +424 -0
- mslk/attention/flash_attn/tile_scheduler.py +720 -0
- mslk/attention/flash_attn/utils.py +860 -0
- mslk/attention/fmha/__init__.py +967 -0
- mslk/attention/fmha/_triton/__init__.py +6 -0
- mslk/attention/fmha/_triton/available.py +50 -0
- mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
- mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
- mslk/attention/fmha/attn_bias.py +2186 -0
- mslk/attention/fmha/attn_bias_utils.py +536 -0
- mslk/attention/fmha/ck.py +508 -0
- mslk/attention/fmha/ck_decoder.py +141 -0
- mslk/attention/fmha/ck_splitk.py +204 -0
- mslk/attention/fmha/common.py +598 -0
- mslk/attention/fmha/cutlass.py +461 -0
- mslk/attention/fmha/cutlass_blackwell.py +560 -0
- mslk/attention/fmha/dispatch.py +224 -0
- mslk/attention/fmha/flash.py +862 -0
- mslk/attention/fmha/flash3.py +858 -0
- mslk/attention/fmha/flash_mtia.py +245 -0
- mslk/attention/fmha/merge_training.py +192 -0
- mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
- mslk/attention/fmha/torch_attention_compat.py +154 -0
- mslk/attention/fmha/tree_attention.py +718 -0
- mslk/attention/fmha/triton_splitk.py +1378 -0
- mslk/attention/fmha/unbind.py +130 -0
- mslk/attention/fmha/utils/__init__.py +6 -0
- mslk/attention/fmha/utils/bench.py +74 -0
- mslk/attention/fmha/utils/cpp_lib.py +148 -0
- mslk/attention/fmha/utils/op_common.py +65 -0
- mslk/attention/gqa_attn_splitk/__init__.py +11 -0
- mslk/bench/comm/__init__.py +7 -0
- mslk/bench/comm/comm_bench.py +255 -0
- mslk/bench/common/__init__.py +5 -0
- mslk/bench/common/utils.py +148 -0
- mslk/bench/conv/__init__.py +7 -0
- mslk/bench/conv/conv_bench.py +551 -0
- mslk/bench/conv/conv_ops.py +213 -0
- mslk/bench/gemm/__init__.py +7 -0
- mslk/bench/gemm/gemm_bench.py +859 -0
- mslk/bench/gemm/gemm_ops.py +3342 -0
- mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
- mslk/bench/moe/__init__.py +7 -0
- mslk/bench/moe/gather_scatter_bench.py +356 -0
- mslk/bench/quantize/quantize_bench.py +345 -0
- mslk/bench/quantize/quantize_ops.py +266 -0
- mslk/comm/__init__.py +11 -0
- mslk/conv/__init__.py +11 -0
- mslk/gemm/__init__.py +18 -0
- mslk/gemm/triton/__init__.py +7 -0
- mslk/gemm/triton/fp8_gemm.py +2702 -0
- mslk/gemm/triton/grouped_gemm.py +1132 -0
- mslk/gemm/triton/matmul_perf_model.py +237 -0
- mslk/gemm/triton/utils.py +128 -0
- mslk/kv_cache/__init__.py +11 -0
- mslk/moe/__init__.py +26 -0
- mslk/moe/activation.py +291 -0
- mslk/moe/gather_scatter.py +739 -0
- mslk/moe/layers.py +1240 -0
- mslk/moe/shuffling.py +421 -0
- mslk/mslk.so +0 -0
- mslk/quantize/__init__.py +11 -0
- mslk/quantize/shuffle.py +306 -0
- mslk/quantize/triton/__init__.py +7 -0
- mslk/quantize/triton/fp4_quantize.py +5942 -0
- mslk/quantize/triton/fp8_quantize.py +1902 -0
- mslk/testing/__init__.py +7 -0
- mslk/testing/attributes.py +60 -0
- mslk/testing/rocm.py +91 -0
- mslk/utils/__init__.py +7 -0
- mslk/utils/torch/__init__.py +7 -0
- mslk/utils/torch/library.py +150 -0
- mslk/utils/triton/__init__.py +7 -0
- mslk/utils/triton/fp8_utils.py +72 -0
- mslk/utils/triton/utils.py +128 -0
- mslk/version.py +11 -0
- mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
- mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
- mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
- mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
- mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
mslk/bench/quantize/quantize_bench.py
ADDED
@@ -0,0 +1,345 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import itertools
import os
import sys
from dataclasses import dataclass
from datetime import datetime
from typing import Callable, Optional

import click
import pandas as pd
import torch
import triton  # @manual=//triton:triton
from mslk.bench.common.utils import BenchOptions, profiler
from mslk.bench.quantize.quantize_ops import get_ops, QuantizeOpBase
from tabulate import tabulate

type ShapeFunction = Callable[[], list[tuple[int, int]]]

shape_registry: dict[str, ShapeFunction] = {}


def register_shapes(name: str) -> Callable[[ShapeFunction], ShapeFunction]:
    def decorator(
        shape_function: ShapeFunction,
    ) -> ShapeFunction:
        shape_registry[name] = shape_function
        return shape_function

    return decorator


@register_shapes("llm_eval")
def llm_eval() -> list[tuple[int, int]]:
    return [
        (1, 5120),
        (1024, 5120),
        (2000, 5120),
        (4096, 5120),
        (16384, 5120),
        (1024, 7168),
        (4096, 4096),
    ]


@register_shapes("decode_1024")
def decode_1024_shapes() -> list[tuple[int, int]]:
    return [
        (1, 1024),
        (1, 2048),
        (1, 4096),
        (1, 5120),
        (1, 6144),
        (1, 7168),
        (1, 8192),
    ]


@register_shapes("prefill_1024")
def prefill_1024_shapes() -> list[tuple[int, int]]:
    shapes = []
    for M in [2048, 4096, 8192, 16384]:
        shapes += [
            (M, 1024),
            (M, 2048),
            (M, 4096),
            (M, 5120),
            (M, 6144),
            (M, 7168),
            (M, 8192),
        ]
    return shapes


@dataclass
class Metrics:
    op: str
    M: int = 0
    K: int = 0
    sim: float = 0.0
    us: float = 0.0
    gbps: float = 0.0
    memory_bw_util: float = 0.0

    @staticmethod
    def header() -> str:
        header = f"{'OpName':<20} {'Problem Shape':<15} {'Sim':<10} {'Us':<10} {'GB/s':<10} {'Mem BW Util %':<10}"
        divider = "-" * len(header)
        return f"Quantize Bench\n{divider}\n{header}\n{divider}"

    def __str__(self) -> str:
        problem_shape = f"({self.M}, {self.K})"
        return f"{self.op:<20} {problem_shape:<15} {self.sim:<10.3f} {self.us:<10.3f} {self.gbps:<10.2f} {self.memory_bw_util:<10.2f}"

    def as_dict(self) -> dict[str, float]:
        return {
            "M": self.M,
            "K": self.K,
            f"{self.op}_sim": self.sim,
            f"{self.op}_us": self.us,
            f"{self.op}_gb/s": self.gbps,
            f"{self.op}_memory_bw_util": self.memory_bw_util,
        }


def get_problem_shapes(
    shapes: Optional[str],
    m: Optional[str],
    k: Optional[str],
    pair_mk: bool,
) -> list[tuple[int, int]]:
    if shapes:
        all_shapes = set()

        for shape in shapes.strip().split(","):
            if shape not in shape_registry:
                print(
                    f"Shape {shape} not found in shape registry. Valid shapes: {', '.join(shape_registry.keys())}."
                )
                sys.exit(1)
            all_shapes.update(shape_registry[shape]())

        return list(all_shapes)

    if m is None:
        raise Exception("M must be non-empty.")
    M = [int(m_val) for m_val in m.strip().split(",")]
    if k is None:
        raise Exception("K must be non-empty.")
    K = [int(k_val) for k_val in k.strip().split(",")]

    if pair_mk:
        if len(M) != len(K):
            raise Exception("M and K must be the same length in pair_MK mode.")
        return list(zip(M, K))
    else:
        return list(itertools.product(M, K))


def benchmark(
    quantize_ops: list[QuantizeOpBase],
    m: int,
    k: int,
    mem_bw_roofline_gbps: float,
    opts: BenchOptions,
) -> list[Metrics]:
    # Create input tensors.
    input = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)

    # Keep track of results.
    results = []
    # Benchmark each operator.
    for quantize_op in quantize_ops:
        metrics = Metrics(op=quantize_op.name, M=m, K=k)
        args = quantize_op.preprocess(input)
        quantized = quantize_op.quantize(input, *args)
        dequantized = quantize_op.dequantize(*quantized)
        metrics.sim = torch.mean(torch.pow(dequantized - input, 2)).item()

        for _ in range(opts.num_iters):
            with profiler(enabled=opts.trace, with_stack=True):
                ms_runtime = quantize_op.benchmark(
                    input,
                    args,
                    opts=opts,
                )

            input_bytes = input.numel() * input.element_size()
            output_bytes = sum(t.numel() * t.element_size() for t in quantized)
            total_size_bytes = input_bytes + output_bytes
            gbps = (total_size_bytes / 1e9) / (ms_runtime / 1e3)
            metrics.gbps += gbps
            metrics.us += ms_runtime * 1000
            metrics.memory_bw_util += (gbps / mem_bw_roofline_gbps) * 100

        metrics.us /= opts.num_iters
        metrics.gbps /= opts.num_iters
        metrics.memory_bw_util /= opts.num_iters

        results.append(metrics)

    return results


def collect_kernels_to_profile(kernels: Optional[list[str]]) -> list[QuantizeOpBase]:
    # Get existing quantization operators.
    quantize_ops = [op for op in get_ops() if op.supported]
    if kernels is None:
        return quantize_ops
    return [op for op in quantize_ops if op.name in kernels]


def print_kernels(kernels: Optional[list[str]]) -> None:
    data = sorted(
        [
            (op.name, "Yes" if op.cuda else "No", "Yes" if op.hip else "No")
            for op in get_ops()
        ]
    )
    print(tabulate(data, headers=["Name", "CUDA", "ROCm"], tablefmt="orgtbl"))


@click.command()
@click.option(
    "--output-dir",
    default="/tmp",
    help="Directory to save plots and csvs to",
)
@click.option(
    "--num-iters",
    default=1,
    type=int,
    help="Number of iterations to repeat each benchmark.",
)
@click.option(
    "--export-csv",
    is_flag=True,
    help="Export results to a CSV file.",
)
@click.option(
    "--kernels",
    default=None,
    help="Comma separated list of kernels to benchmark. Defaults to all kernels.",
)
@click.option(
    "--M",
    default=None,
    help="Comma separated list of M values to benchmark.",
)
@click.option(
    "--K",
    default=None,
    help="Comma separated list of K values to benchmark.",
)
@click.option(
    "--pair-MK",
    is_flag=True,
    help="If set, instead of benchmarking cartesian product of M * K, benchmark consecutive MK pairs together.",
)
@click.option(
    "--no-cuda-graph",
    is_flag=True,
    help="If set, do not use cuda graph for benchmarking.",
)
@click.option(
    "--no-rotating-buffer",
    is_flag=True,
    help="If set, do not use rotating buffer for benchmarking.",
)
@click.option(
    "--shapes",
    default=None,
    help=f"Specific model shapes to use, options: {', '.join(shape_registry.keys())}.",
)
@click.option(
    "--trace",
    is_flag=True,
    help="If set, produce a performance trace of the benchmark.",
)
def invoke_main(
    output_dir: str,
    num_iters: int,
    export_csv: bool,
    kernels: Optional[str],
    m: Optional[str],
    k: Optional[str],
    pair_mk: bool,
    no_cuda_graph: bool,
    no_rotating_buffer: bool,
    shapes: Optional[str],
    trace: bool,
) -> None:
    # If kernel filter is provided, parse it. Else, benchmark all kernels.
    all_kernels = kernels.strip().split(",") if kernels else None
    quantize_ops = collect_kernels_to_profile(all_kernels)

    if len(quantize_ops) == 0:
        print("No valid kernels to benchmark. Available kernels:")
        print_kernels(all_kernels)
        sys.exit(1)

    if num_iters < 1:
        print("Warning: Number of iterations must be at least 1.")
        num_iters = 1

    mem_bw_roofline_gbps = triton.testing.get_dram_gbps()
    MK = get_problem_shapes(shapes, m, k, pair_mk)

    opts = BenchOptions(
        num_iters=num_iters,
        cuda_graph=not no_cuda_graph,
        rotating_buffer=not no_rotating_buffer,
        trace=trace,
    )

    # Iterate over shapes and benchmark.
    benchmark_results = []
    csv = []
    for M, K in MK:
        quantize_measurements = benchmark(
            quantize_ops,
            M,
            K,
            mem_bw_roofline_gbps,
            opts,
        )
        benchmark_results.extend(quantize_measurements)
        csv_row = {}
        for metric in quantize_measurements:
            csv_row.update(metric.as_dict())
        csv.append(csv_row)

    print(Metrics.header())
    for metric in benchmark_results:
        print(metric)

    print("")
    print(f"Hardware: {torch.cuda.get_device_name()}")
    print(f" Memory BW Roofline: {mem_bw_roofline_gbps} GB/s")

    print("")
    print("Benchmark Settings:")
    print(f" CUDA graph: {opts.cuda_graph}")
    print(f" Buffer rotation: {opts.rotating_buffer}")

    if export_csv:
        os.makedirs(output_dir, exist_ok=True)
        datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_file = os.path.join(
            output_dir, f"quantize_ops_benchmark_{datetime_str}.csv"
        )
        # Export results to a CSV file.
        df = pd.DataFrame(csv)
        df.to_csv(csv_file, na_rep="NaN", index=False)
        print(f"CSV saved to {csv_file}")


if __name__ == "__main__":
    invoke_main()
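
The benchmark above is exposed as a click CLI, but the helpers it defines (benchmark, collect_kernels_to_profile, Metrics, BenchOptions) can also be driven directly. The following is a minimal, hypothetical driver sketch, not part of the wheel; it assumes a CUDA-capable machine with the mslk extension libraries importable, and the kernel names it passes are the registered op class names from quantize_ops.py.

# Hypothetical driver sketch (not shipped in the wheel): benchmark two registered
# quantize ops on a single (M, K) shape without going through the click CLI.
import triton

from mslk.bench.common.utils import BenchOptions
from mslk.bench.quantize.quantize_bench import (
    Metrics,
    benchmark,
    collect_kernels_to_profile,
)

# Filter the registry down to ops supported on the current device.
ops = collect_kernels_to_profile(["TritonFP8Rowwise", "TritonFP8Blockwise"])
opts = BenchOptions(num_iters=3, cuda_graph=True, rotating_buffer=True, trace=False)
roofline_gbps = triton.testing.get_dram_gbps()  # same roofline source used by invoke_main

print(Metrics.header())
for metric in benchmark(ops, 4096, 5120, roofline_gbps, opts):
    print(metric)

# Roughly equivalent CLI invocation (assuming the module is run with -m):
#   python -m mslk.bench.quantize.quantize_bench \
#       --kernels TritonFP8Rowwise,TritonFP8Blockwise --M 4096 --K 5120 --num-iters 3
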
mslk/bench/quantize/quantize_ops.py
ADDED
@@ -0,0 +1,266 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import abc
from typing import Any, TypeVar

import torch
from mslk.bench.common.utils import BenchOptions, do_bench
from mslk.quantize.triton.fp4_quantize import triton_quantize_nvfp4
from mslk.quantize.triton.fp8_quantize import (
    dequantize_fp8_block,
    dequantize_fp8_row,
    triton_quantize_fp8_block,
    triton_quantize_fp8_group,
    triton_quantize_fp8_row,
    triton_quantize_fp8_tensor,
)
from mslk.test.quantize.triton.fp4_quantize_test import (
    dequantize_nvfp4,
    global_scale_nvfp4,
)


class QuantizeOpBase(metaclass=abc.ABCMeta):
    """Helper abstract class to define expected methods of quantize ops."""

    @abc.abstractmethod
    def quantize(self, input: torch.Tensor) -> Any:
        """Function which quantizes inputs."""
        pass

    @abc.abstractmethod
    def dequantize(self, *args: Any) -> torch.Tensor:
        """Function which dequantizes inputs. Used for sanity checking."""
        pass

    @abc.abstractproperty
    def hip(self) -> bool:
        """Whether this operator supports AMD or not."""
        pass

    @abc.abstractproperty
    def cuda(self) -> bool:
        """Whether this operator supports Nvidia or not."""
        pass

    def preprocess(self, input: torch.Tensor) -> Any:
        """This is used for ops that require additional preprocessing. This method will not be counted in benchmarking."""
        return ()

    @property
    def name(self) -> str:
        """Name of this operator."""
        return self.__class__.__name__

    @property
    def supported(self) -> bool:
        """Whether this op will run on the current device."""
        if torch.version.hip is not None:
            return self.hip
        elif torch.version.cuda is not None:
            return self.cuda
        else:
            return False

    def benchmark(
        self,
        input: torch.Tensor,
        args: Any,
        opts: BenchOptions,
    ) -> float:
        """Benchmark runtime of this operator using do_bench from common."""
        return do_bench(
            lambda inp, *a: self.quantize(inp, *a),
            (input, *args),
            opts,
        )


op_registry: dict[str, QuantizeOpBase] = {}

T = TypeVar("T", bound=QuantizeOpBase)


def register_op(op_class: type[T]) -> type[T]:
    """Decorator function for assembling all quantize ops."""
    op_registry[op_class.__name__] = op_class()
    return op_class


def get_ops() -> list[QuantizeOpBase]:
    """Get all registered quantize ops."""
    return list(op_registry.values())


@register_op
class TritonFP8Rowwise(QuantizeOpBase):
    def quantize(self, input: torch.Tensor) -> Any:
        return triton_quantize_fp8_row(input)

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized: torch.Tensor
        scale: torch.Tensor
        input_quantized, scale = args
        return dequantize_fp8_row(input_quantized, scale)

    @property
    def hip(self) -> bool:
        return True

    @property
    def cuda(self) -> bool:
        return True


@register_op
class TritonFP8Blockwise(QuantizeOpBase):
    def __init__(self) -> None:
        super().__init__()
        self.block_m = 128
        self.block_k = 128

    def quantize(self, input: torch.Tensor) -> Any:
        return triton_quantize_fp8_block(
            input, block_m=self.block_m, block_k=self.block_k
        )

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized: torch.Tensor
        scale: torch.Tensor
        input_quantized, scale = args
        return dequantize_fp8_block(input_quantized, scale, self.block_m, self.block_k)

    @property
    def hip(self) -> bool:
        return True

    @property
    def cuda(self) -> bool:
        return True


@register_op
class TritonFP8Groupwise(QuantizeOpBase):
    def __init__(self) -> None:
        super().__init__()
        self.group_size = 128

    def quantize(self, input: torch.Tensor) -> Any:
        return triton_quantize_fp8_group(input, group_size=self.group_size)

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized: torch.Tensor
        scale: torch.Tensor
        input_quantized, scale = args

        input_quantized = input_quantized.to(torch.float)
        dequantized = input_quantized.view(
            -1, input_quantized.shape[1] // self.group_size, self.group_size
        ) * scale.unsqueeze(-1)
        return dequantized.view(input_quantized.shape)

    @property
    def hip(self) -> bool:
        return True

    @property
    def cuda(self) -> bool:
        return True


@register_op
class TritonNVFP4(QuantizeOpBase):
    def __init__(self) -> None:
        super().__init__()

    def preprocess(self, input: torch.Tensor) -> Any:
        global_scale = global_scale_nvfp4(input)
        return (global_scale,)

    def quantize(self, input: torch.Tensor, *args: Any) -> Any:
        global_scale: torch.Tensor
        global_scale = args[0]
        input_quantized, scales = triton_quantize_nvfp4(input, global_scale)
        return input_quantized.view(torch.uint8), scales, global_scale

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized: torch.Tensor
        scale: torch.Tensor
        global_scale: torch.Tensor
        input_quantized, scale, global_scale = args

        return dequantize_nvfp4(input_quantized, scale, global_scale)

    @property
    def hip(self) -> bool:
        return False

    @property
    def cuda(self) -> bool:
        return True


@register_op
class CudaFP8Rowwise(QuantizeOpBase):
    def quantize(self, input: torch.Tensor) -> Any:
        return torch.ops.mslk.quantize_fp8_per_row(input)

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized: torch.Tensor
        scale: torch.Tensor
        input_quantized, scale = args
        return dequantize_fp8_row(input_quantized, scale)

    @property
    def hip(self) -> bool:
        return True

    @property
    def cuda(self) -> bool:
        return True


@register_op
class CudaFP8Tensorwise(QuantizeOpBase):
    def quantize(self, input: torch.Tensor) -> Any:
        return torch.ops.mslk.quantize_fp8_per_tensor(input)

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized: torch.Tensor
        scale: torch.Tensor
        input_quantized, scale = args
        return input_quantized.to(torch.float32) * scale

    @property
    def hip(self) -> bool:
        return True

    @property
    def cuda(self) -> bool:
        return True


@register_op
class TritonFP8Tensorwise(QuantizeOpBase):
    def quantize(self, input: torch.Tensor) -> Any:
        return triton_quantize_fp8_tensor(input)

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized: torch.Tensor
        scale: torch.Tensor
        input_quantized, scale = args
        return input_quantized.to(torch.float32) * scale

    @property
    def hip(self) -> bool:
        return True

    @property
    def cuda(self) -> bool:
        return True
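
New operators plug into the benchmark by subclassing QuantizeOpBase and decorating the class with register_op; the class name then becomes the identifier accepted by the --kernels flag. The sketch below is a hypothetical example (NaiveFP16Cast is not part of the wheel) that only illustrates the contract: quantize returns the payload plus its scale, and dequantize(*quantized) reverses it.

# Hypothetical example op (not shipped in the wheel): a placeholder FP16 cast
# that follows the (payload, scale) convention used by the real operators.
from typing import Any

import torch
from mslk.bench.quantize.quantize_ops import QuantizeOpBase, register_op


@register_op
class NaiveFP16Cast(QuantizeOpBase):
    def quantize(self, input: torch.Tensor) -> Any:
        # Return the quantized payload plus a dummy per-row unit scale so that
        # dequantize(*quantized) works in the benchmark's sanity check.
        scale = torch.ones(input.shape[0], device=input.device, dtype=torch.float32)
        return input.to(torch.float16), scale

    def dequantize(self, *args: Any) -> torch.Tensor:
        input_quantized, scale = args
        return input_quantized.to(torch.bfloat16) * scale.unsqueeze(-1)

    @property
    def hip(self) -> bool:
        return True

    @property
    def cuda(self) -> bool:
        return True

Once imported, the op is picked up automatically by get_ops() and therefore by collect_kernels_to_profile in quantize_bench.py.
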
mslk/comm/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from mslk.utils.torch.library import load_library_buck

load_library_buck("//mslk/csrc/comm:car_ops")
mslk/conv/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from mslk.utils.torch.library import load_library_buck

load_library_buck("//mslk/csrc/conv:conv_ops")
mslk/gemm/__init__.py
ADDED
@@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from mslk.utils.torch.library import load_library_buck

load_library_buck("//mslk/csrc/gemm:gemm_ops")

gemm_ops = [
    "//mslk/csrc/gemm/cutlass:cutlass_bf16bf16bf16_grouped_grad",
    "//mslk/csrc/gemm/cutlass:cutlass_bf16bf16bf16_grouped_wgrad",
]
for op in gemm_ops:
    load_library_buck(op)