PyPI - humming-kernels - Versions diffs - 0.1.0__py3-none-any.whl - Mend

humming-kernels 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111)

humming/__init__.py +2 -0
humming/config/__init__.py +13 -0
humming/config/base.py +129 -0
humming/config/config.py +196 -0
humming/config/enum.py +21 -0
humming/config/mma.py +301 -0
humming/csrc/launcher/elf.h +125 -0
humming/csrc/launcher/launcher.cpp +262 -0
humming/csrc/launcher/tensor.h +288 -0
humming/csrc/launcher/tma.h +77 -0
humming/csrc/launcher/torch_api.h +63 -0
humming/csrc/launcher/utils.h +105 -0
humming/dtypes.py +205 -0
humming/include/humming/arith/epilogue_arith.cuh +182 -0
humming/include/humming/arith/exp_offset.cuh +126 -0
humming/include/humming/arith/mainloop_arith.cuh +511 -0
humming/include/humming/datatype/base_conversion.cuh +125 -0
humming/include/humming/datatype/dequant.cuh +110 -0
humming/include/humming/datatype/dequant_fused.cuh +90 -0
humming/include/humming/datatype/dequant_prepare.cuh +68 -0
humming/include/humming/datatype/dequant_single.cuh +154 -0
humming/include/humming/datatype/dtypes.cuh +70 -0
humming/include/humming/epilogue/gmem_writer.cuh +171 -0
humming/include/humming/epilogue/pipeline.cuh +112 -0
humming/include/humming/epilogue/smem_reducer.cuh +92 -0
humming/include/humming/epilogue/smem_writer.cuh +212 -0
humming/include/humming/kernel/dequant_weight.cuh +47 -0
humming/include/humming/kernel/humming.cuh +156 -0
humming/include/humming/kernel/humming_ws.cuh +191 -0
humming/include/humming/kernel/pack_weight.cuh +95 -0
humming/include/humming/kernel/process.cuh +263 -0
humming/include/humming/kernel/process_mxfp4.cuh +69 -0
humming/include/humming/kernel/quant_weight.cuh +277 -0
humming/include/humming/kernel/tops_bench.cuh +50 -0
humming/include/humming/memory/g2s_loader/loader_a.cuh +183 -0
humming/include/humming/memory/g2s_loader/loader_as.cuh +131 -0
humming/include/humming/memory/g2s_loader/loader_b.cuh +83 -0
humming/include/humming/memory/g2s_loader/loader_bias.cuh +57 -0
humming/include/humming/memory/g2s_loader/loader_bs.cuh +114 -0
humming/include/humming/memory/g2s_loader/loader_bzp.cuh +91 -0
humming/include/humming/memory/g2s_pipeline.cuh +343 -0
humming/include/humming/memory/s2r_loader/loader_a.cuh +61 -0
humming/include/humming/memory/s2r_loader/loader_as.cuh +65 -0
humming/include/humming/memory/s2r_loader/loader_b.cuh +57 -0
humming/include/humming/memory/s2r_loader/loader_bias.cuh +51 -0
humming/include/humming/memory/s2r_loader/loader_bs.cuh +104 -0
humming/include/humming/memory/s2r_loader/loader_bzp.cuh +64 -0
humming/include/humming/memory/s2r_pipeline.cuh +79 -0
humming/include/humming/mma/wgmma.cuh +175 -0
humming/include/humming/mma/wmma.cuh +124 -0
humming/include/humming/scheduler.cuh +335 -0
humming/include/humming/utils/all.cuh +13 -0
humming/include/humming/utils/base.cuh +71 -0
humming/include/humming/utils/enum.cuh +24 -0
humming/include/humming/utils/ptx/barrier.cuh +122 -0
humming/include/humming/utils/ptx/legacy_load.cuh +180 -0
humming/include/humming/utils/ptx/math.cuh +20 -0
humming/include/humming/utils/ptx/shared.cuh +45 -0
humming/include/humming/utils/ptx/tma.cuh +139 -0
humming/include/humming/utils/ptx/warp.cuh +17 -0
humming/include/humming/utils/ptx/wgmma.cuh +24 -0
humming/include/humming/utils/storage.cuh +163 -0
humming/jit/__init__.py +3 -0
humming/jit/compiler.py +278 -0
humming/jit/runtime.py +136 -0
humming/kernel/__init__.py +17 -0
humming/kernel/dequant_weight.py +65 -0
humming/kernel/humming.py +404 -0
humming/kernel/pack_weight.py +42 -0
humming/kernel/process_mxfp4.py +64 -0
humming/kernel/quant_weight.py +81 -0
humming/kernel/repack_weight.py +103 -0
humming/kernel/tops_bench.py +86 -0
humming/kernel/unpack_weight.py +43 -0
humming/layer.py +821 -0
humming/ops/__init__.py +144 -0
humming/ops/bench.py +43 -0
humming/ops/input.py +229 -0
humming/ops/moe.py +188 -0
humming/ops/utils.py +165 -0
humming/ops/weight.py +212 -0
humming/schema/__init__.py +43 -0
humming/schema/awq.py +118 -0
humming/schema/base.py +316 -0
humming/schema/bitnet.py +108 -0
humming/schema/compressed_tensors.py +330 -0
humming/schema/fp8.py +144 -0
humming/schema/gpt_oss_mxfp4.py +43 -0
humming/schema/gptq.py +110 -0
humming/schema/humming.py +276 -0
humming/schema/modelopt.py +253 -0
humming/schema/mxfp4.py +75 -0
humming/tune/__init__.py +80 -0
humming/tune/base.py +271 -0
humming/tune/sm100.py +9 -0
humming/tune/sm75.py +53 -0
humming/tune/sm8x.py +110 -0
humming/tune/sm90.py +167 -0
humming/tune/sm90_h20.py +198 -0
humming/utils/__init__.py +0 -0
humming/utils/cuda.py +220 -0
humming/utils/device.py +84 -0
humming/utils/jit.py +190 -0
humming/utils/smem.py +91 -0
humming/utils/test.py +386 -0
humming/utils/weight.py +302 -0
humming_kernels-0.1.0.dist-info/METADATA +111 -0
humming_kernels-0.1.0.dist-info/RECORD +111 -0
humming_kernels-0.1.0.dist-info/WHEEL +5 -0
humming_kernels-0.1.0.dist-info/licenses/LICENSE +202 -0
humming_kernels-0.1.0.dist-info/top_level.txt +1 -0

humming/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import humming.ops # noqa
2	+ import humming.dtypes # noqa

humming/config/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+from humming.config.config import ComputeConfig, LayerConfig, TuningConfig
+from humming.config.enum import GemmType, MmaType, WeightScaleType
+from humming.config.mma import MmaOpClass
+__all__ = [
+    "LayerConfig",
+    "ComputeConfig",
+    "TuningConfig",
+    "MmaType",
+    "WeightScaleType",
+    "GemmType",
+    "MmaOpClass",
+]

humming/config/base.py ADDED Viewed

@@ -0,0 +1,129 @@
+import dataclasses
+import json
+import re
+from enum import Enum
+from typing import Any, ClassVar
+from humming import dtypes
+def name_to_google_cpp_const_style(name: str) -> str:
+    if not name:
+        return ""
+    name = name.strip().lower()
+    words = re.split(r"[_ \W]+", name)
+    pascal_words = [word.capitalize() for word in words if word]
+    return "k" + "".join(pascal_words)
+def name_value_to_google_cpp_const_style(name: str, value: Any, keep_name: bool = False) -> str:
+    if not keep_name:
+        name = name_to_google_cpp_const_style(name)
+    if isinstance(value, bool):
+        value = "true" if value else "false"
+    elif isinstance(value, float):
+        value = str(value) + "f"
+    elif isinstance(value, int):
+        value = str(value) + "u"
+    else:
+        value = str(value).replace(".", "::")
+    return f"static constexpr auto {name} = {value};"
+def name_value_to_extern_const_style(name: str, value: Any) -> str:
+    name = name.upper()
+    if isinstance(value, (bool, int)):
+        value = int(value)
+        return f'extern "C" __constant__ uint32_t {name} = {value};'
+    return ""
+def name_value_to_macro_style(name: str, value: Any) -> str:
+    name = name.upper()
+    if isinstance(value, (bool, int)):
+        value = int(value)
+        return f"#define HUMMING_{name.upper()} {int(value)}"
+    return ""
+@dataclasses.dataclass
+class BaseHummingConfig:
+    _name_map: ClassVar[dict[str, str]] = {}
+    _cpp_extra_names: ClassVar[tuple[str, ...]] = ()
+    def __post_init__(self):
+        pass
+    def to_cpp_str(
+        self,
+        cls: type["BaseHummingConfig"] | None = None,
+        include_class_name: bool = False,
+    ) -> str:
+        cls = cls or self.__class__
+        str_list = []
+        names = [x.name for x in dataclasses.fields(cls)]
+        names += list(cls._cpp_extra_names)
+        for name in names:
+            value = getattr(self, name)
+            if not isinstance(value, (bool, int, Enum)):
+                continue
+            keep_name = name in cls._name_map
+            if keep_name:
+                name = cls._name_map[name]
+            line = name_value_to_google_cpp_const_style(name, value, keep_name)
+            str_list.append(line)
+        code = "\n".join("  " + x for x in str_list)
+        class_name = cls.__name__
+        if include_class_name:
+            code = f"class {class_name} {{\n{code}\n}};"
+        return code
+    def to_macro_cpp_str(self, cls: type["BaseHummingConfig"] | None = None) -> str:
+        cls = cls or self.__class__
+        str_list = []
+        names = [x.name for x in dataclasses.fields(cls)]
+        names += list(cls._cpp_extra_names)
+        for name in names:
+            value = getattr(self, name)
+            if not isinstance(value, (bool, int, Enum)):
+                continue
+            line = name_value_to_macro_style(name, value)
+            str_list.append(line)
+        str_list = [x for x in str_list if x]
+        code = "\n".join(x for x in str_list if x)
+        return code
+    def to_extern_cpp_str(self, cls: type["BaseHummingConfig"] | None = None) -> str:
+        cls = cls or self.__class__
+        str_list = []
+        names = [x.name for x in dataclasses.fields(cls)]
+        names += list(cls._cpp_extra_names)
+        for name in names:
+            value = getattr(self, name)
+            if not isinstance(value, (bool, int, Enum)):
+                continue
+            line = name_value_to_extern_const_style(name, value)
+            str_list.append(line)
+        str_list = [x for x in str_list if x]
+        code = "\n".join(x for x in str_list if x)
+        return code
+    def to_str(self) -> str:
+        res = {}
+        for field in dataclasses.fields(self):
+            value = getattr(self, field.name)
+            if isinstance(value, Enum):
+                value = value.value
+            elif isinstance(value, dtypes.DataType):
+                value = str(value)
+            res[field.name] = value
+        return json.dumps(res)

humming/config/config.py ADDED Viewed

@@ -0,0 +1,196 @@
+import dataclasses
+import math
+from typing import ClassVar
+import torch
+from humming import dtypes
+from humming.config.base import BaseHummingConfig
+from humming.config.enum import GemmType, MmaType, WeightScaleType
+@dataclasses.dataclass(kw_only=True)
+class LayerConfig(BaseHummingConfig):
+    # shape config
+    shape_n: int
+    shape_k: int
+    pad_shape_n: int = 0
+    pad_shape_k: int = 0
+    num_experts: int = 0
+    # datatype config
+    b_dtype: dtypes.DataType
+    a_dtype: dtypes.DataType
+    c_dtype: dtypes.DataType
+    bs_dtype: dtypes.DataType | None = None
+    # quant param config
+    input_scale_group_size: int = 0
+    weight_scale_group_size: int = 0
+    weight_scale_group_size_n: int = 0
+    weight_scale_type: WeightScaleType | None = None
+    use_int_weight_scale: bool = False
+    use_fused_e8m0_scale: bool = False
+    has_zero_point: bool = False
+    is_fp_zero_point: bool = False
+    # bias config
+    has_bias: bool = False
+    # mma config
+    mma_type: MmaType | None = None
+    _cpp_extra_names: ClassVar[tuple[str, ...]] = (
+        "is_channel_weight_scale",
+        "is_block_weight_scale",
+        "is_group_weight_scale",
+        "is_tensor_weight_scale",
+        "has_input_scale",
+    )
+    def __post_init__(self):
+        self.problem_shape = (0, self.shape_n, self.shape_k)
+        self.pad_shape = (0, self.pad_shape_n, self.pad_shape_k)
+        if self.bs_dtype is None:
+            self.bs_dtype = self.c_dtype
+        if self.weight_scale_type is None:
+            if self.weight_scale_group_size_n > 1:
+                self.weight_scale_type = WeightScaleType.BLOCK
+            elif self.weight_scale_group_size == 0:
+                self.weight_scale_type = WeightScaleType.CHANNEL
+            elif self.weight_scale_group_size > 0:
+                self.weight_scale_type = WeightScaleType.GROUP
+        if isinstance(self.weight_scale_type, str):
+            self.weight_scale_type = WeightScaleType(self.weight_scale_type)
+        if self.weight_scale_type is None:
+            if self.weight_scale_group_size == 0:
+                self.weight_scale_type = WeightScaleType.CHANNEL
+            elif self.weight_scale_group_size > 0 and self.weight_scale_group_size_n > 1:
+                self.weight_scale_type = WeightScaleType.BLOCK
+            elif self.weight_scale_group_size > 0:
+                self.weight_scale_type = WeightScaleType.GROUP
+        if self.mma_type is None:
+            sm_version = torch.cuda.get_device_capability()[0]
+            self.mma_type = MmaType.WGMMA if sm_version == 9 else MmaType.MMA
+        if isinstance(self.mma_type, str):
+            self.mma_type = MmaType(self.mma_type)
+        for name in ["a", "b", "c", "bs"]:
+            value = getattr(self, f"{name}_dtype")
+            if isinstance(value, str):
+                value = dtypes.DataType.from_str(value)
+            setattr(self, f"{name}_dtype", value)
+        self.has_input_scale = self.a_dtype.num_bits != 16
+        self.is_channel_weight_scale = self.weight_scale_type == WeightScaleType.CHANNEL
+        self.is_tensor_weight_scale = self.weight_scale_type in [
+            WeightScaleType.TENSOR,
+            WeightScaleType.GROUP_TENSOR,
+        ]
+        self.is_block_weight_scale = self.weight_scale_type == WeightScaleType.BLOCK
+        self.is_group_weight_scale = self.weight_scale_type in [
+            WeightScaleType.GROUP,
+            WeightScaleType.GROUP_TENSOR,
+        ]
+@dataclasses.dataclass(kw_only=True)
+class ComputeConfig(BaseHummingConfig):
+    use_f16_accum: bool = False
+    use_batch_invariant: bool = False
+    gemm_type: GemmType | None = None
+    _cpp_extra_names: ClassVar[tuple[str, ...]] = (
+        "gemm_type_id",
+        "is_indexed_gemm",
+        "is_grouped_gemm",
+        "is_grouped_contiguous_gemm",
+        "is_grouped_masked_gemm",
+    )
+    def __post_init__(self):
+        if isinstance(self.gemm_type, str):
+            self.gemm_type = GemmType(self.gemm_type)
+        self.is_indexed_gemm = self.gemm_type == GemmType.INDEXED
+        self.is_grouped_contiguous_gemm = self.gemm_type == GemmType.GROUPED_CONTIGUOUS
+        self.is_grouped_masked_gemm = self.gemm_type == GemmType.GROUPED_MASKED
+        self.is_grouped_gemm = self.is_grouped_contiguous_gemm or self.is_grouped_masked_gemm
+    @property
+    def gemm_type_id(self):
+        assert self.gemm_type is not None
+        value = self.gemm_type.value.lower()
+        return ["dense", "indexed", "grouped_contiguous", "grouped_masked"].index(value)
+@dataclasses.dataclass(kw_only=True)
+class TuningConfig(BaseHummingConfig):
+    block_shape: tuple[int, int, int]
+    warp_shape: tuple[int, int, int]
+    use_stream_k: bool = True
+    num_stages: int = 2
+    num_ctas_per_sm: int = 1
+    use_warp_spec: bool | None = None
+    use_mbarrier: bool | None = None
+    use_cp_async: bool | None = None
+    use_tma: bool | None = None
+    use_tma_a: bool | None = None
+    use_tma_b: bool | None = None
+    use_tma_c: bool | None = None
+    use_tma_bs: bool | None = None
+    use_tma_bzp: bool | None = None
+    use_tma_bias: bool | None = None
+    num_write_splits: int = 1
+    multi_cast_size_a: int = 1
+    multi_cast_size_b: int = 1
+    _cpp_extra_names: ClassVar[tuple[str, ...]] = (
+        "num_threads",
+        "num_math_threads",
+        "num_load_threads",
+    )
+    _name_map = {
+        "use_mbarrier": "kUseMBarrier",
+        "use_tma_bs": "kUseTmaBS",
+        "use_tma_bzp": "kUseTmaBZP",
+    }
+    def __post_init__(self):
+        if self.use_warp_spec is None:
+            self.use_warp_spec = False
+        if self.use_tma is None:
+            self.use_tma = False
+        if self.use_mbarrier is None:
+            self.use_mbarrier = self.use_tma or self.use_warp_spec
+        if self.use_cp_async is None:
+            sm_version = torch.cuda.get_device_capability()
+            self.use_cp_async = sm_version[0] >= 8
+        self.num_math_threads = math.prod(self.block_shape) // math.prod(self.warp_shape) * 32
+        if self.use_warp_spec:
+            self.num_load_threads = 128
+            self.num_threads = self.num_math_threads + 128
+        else:
+            self.num_load_threads = self.num_math_threads
+            self.num_threads = self.num_math_threads
+        for name in dir(self):
+            if not name.startswith("use_tma_"):
+                continue
+            if not self.use_tma:
+                assert getattr(self, name) is not True
+            if getattr(self, name) is None:
+                setattr(self, name, self.use_tma)

humming/config/enum.py ADDED Viewed

@@ -0,0 +1,21 @@
+import enum
+class MmaType(enum.Enum):
+    """Matrix-multiply instruction family; values are lowercase string ids."""
+    MMA = "mma"
+    WGMMA = "wgmma"
+class WeightScaleType(enum.Enum):
+    """Layout of the weight scales; values are lowercase string ids."""
+    GROUP = "group"
+    BLOCK = "block"
+    CHANNEL = "channel"
+    TENSOR = "tensor"
+    GROUP_TENSOR = "group_tensor"
+class GemmType(enum.Enum):
+    """Kind of GEMM problem; values are lowercase string ids."""
+    DENSE = "dense"
+    INDEXED = "indexed"
+    GROUPED_CONTIGUOUS = "grouped_contiguous"
+    GROUPED_MASKED = "grouped_masked"

humming/config/mma.py ADDED Viewed

@@ -0,0 +1,301 @@
+import math
+import re
+import humming.dtypes as dtypes
+from humming.config.enum import MmaType
+# Bit width of each dtype name used in the generated PTX type suffixes.
+DTYPE_BIT_WIDTH_MAP = {
+    "f32": 32,
+    "s32": 32,
+    "f16": 16,
+    "bf16": 16,
+    "e4m3": 8,
+    "e5m2": 8,
+    "s8": 8,
+    "e2m1": 4,
+    "s4": 4,
+}
+# Maps humming DataType objects to the dtype names keyed in
+# DTYPE_BIT_WIDTH_MAP.
+DTYPE_MAP = {
+    dtypes.float32: "f32",
+    dtypes.int32: "s32",
+    dtypes.float16: "f16",
+    dtypes.bfloat16: "bf16",
+    dtypes.float8e4m3: "e4m3",
+    dtypes.float8e5m2: "e5m2",
+    dtypes.int8: "s8",
+    dtypes.float4e2m1: "e2m1",
+    dtypes.int4: "s4",
+}
+def calc_reg_count(rows, cols, ptx_dtype):
+    total_bits = rows * cols * DTYPE_BIT_WIDTH_MAP[ptx_dtype]
+    assert total_bits % (32 * 32) == 0
+    reg_count = total_bits // (32 * 32)
+    return reg_count
+class MmaOpClassImpl:
+    """Generates the C++ ``MmaOpClass`` body for an ``mma.sync`` instruction.
+
+    Dtypes may be passed either as dtype-name strings (keys of
+    ``DTYPE_BIT_WIDTH_MAP``) or as objects that are keys of ``DTYPE_MAP``.
+    """
+    def __init__(self, m, n, k, a_dtype, b_dtype, cd_dtype):
+        """Store the shape, normalize dtypes and size the register arrays.
+
+        Raises:
+            ValueError: if the accumulator dtype is not f16, bf16, f32 or s32.
+        """
+        self.shape = (m, n, k)
+        self.a_dtype = a_dtype if isinstance(a_dtype, str) else DTYPE_MAP[a_dtype]
+        self.b_dtype = b_dtype if isinstance(b_dtype, str) else DTYPE_MAP[b_dtype]
+        self.cd_dtype = cd_dtype if isinstance(cd_dtype, str) else DTYPE_MAP[cd_dtype]
+        self.reg_a_count = calc_reg_count(m, k, self.a_dtype)
+        self.reg_b_count = calc_reg_count(k, n, self.b_dtype)
+        self.reg_cd_count = calc_reg_count(m, n, self.cd_dtype)
+        # Pick the C++ value type and register type of the accumulator;
+        # only f32 accumulators use float registers.
+        if self.cd_dtype == "f16":
+            self.val_type_cd = "half"
+            self.reg_cd_type = "uint32_t"
+        elif self.cd_dtype == "bf16":
+            self.val_type_cd = "nv_bfloat16"
+            self.reg_cd_type = "uint32_t"
+        elif self.cd_dtype == "f32":
+            self.val_type_cd = "float"
+            self.reg_cd_type = "float"
+        elif self.cd_dtype == "s32":
+            self.val_type_cd = "int32_t"
+            self.reg_cd_type = "uint32_t"
+        else:
+            raise ValueError(f"Invalid cd_dtype: {cd_dtype}")
+    def to_cpp_str(self, include_class_name=False):
+        """Return the C++ members (types, constants and ``fma``) as source.
+
+        When ``include_class_name`` is true the members are wrapped in
+        ``class MmaOpClass { ... };``.
+        """
+        reg_cd_type = self.reg_cd_type
+        lines = [
+            "static constexpr MmaType kMmaType = MmaType::MMA;",
+            f"using MmaShape = Shape<{self.shape[0]}, {self.shape[1]}, {self.shape[2]}>;",
+            "",
+            f"using ValTypeC = {self.val_type_cd};",
+            f"using ValTypeD = {self.val_type_cd};",
+            "",
+            f"static constexpr uint32_t kATypeBits = {DTYPE_BIT_WIDTH_MAP[self.a_dtype]};",
+            f"static constexpr uint32_t kBTypeBits = {DTYPE_BIT_WIDTH_MAP[self.b_dtype]};",
+            f"static constexpr uint32_t kCTypeBits = {DTYPE_BIT_WIDTH_MAP[self.cd_dtype]};",
+            f"static constexpr uint32_t kDTypeBits = {DTYPE_BIT_WIDTH_MAP[self.cd_dtype]};",
+            "",
+            f"using ARegisters = uint32_t[{self.reg_a_count}];",
+            f"using BRegisters = uint32_t[{self.reg_b_count}];",
+            f"using CRegisters = {self.reg_cd_type}[{self.reg_cd_count}];",
+            f"using DRegisters = {self.reg_cd_type}[{self.reg_cd_count}];",
+            "",
+            "CUDA_INLINE",
+            f"static void fma(uint32_t *a, uint32_t *b, {reg_cd_type} *c, {reg_cd_type} *d) {{",
+            *self.generate_ptx(indent=2).strip("\n").split("\n"),
+            "};",
+        ]
+        # Indent every non-empty line by two spaces; blank separators stay empty.
+        code = "\n".join("  " + x if x else x for x in lines)
+        if include_class_name:
+            code = f"class MmaOpClass {{\n{code}\n}};"
+        return code
+    def generate_ptx(self, indent=0):
+        """Return the ``asm volatile`` statement for the ``mma.sync`` op.
+
+        Every line of the result is prefixed with a newline and ``indent``
+        spaces.
+        """
+        a_dtype = self.a_dtype
+        b_dtype = self.b_dtype
+        cd_dtype = self.cd_dtype
+        shape = self.shape
+        asm_op = f"mma.sync.aligned.m{shape[0]}n{shape[1]}k{shape[2]}.row.col"
+        asm_op += f".{cd_dtype}.{a_dtype}.{b_dtype}.{cd_dtype}"
+        # Among the known dtype names only the integer ones ("s8", "s4", ...)
+        # contain the letter "s".
+        if "s" in a_dtype:
+            asm_op += ".satfinite"
+        start = 0
+        end = 0
+        param_placeholders_list = []
+        # Placeholder groups are numbered in asm operand order: d (outputs),
+        # then the a, b and c inputs.
+        counts = [self.reg_cd_count, self.reg_a_count, self.reg_b_count, self.reg_cd_count]
+        for i in range(len(counts)):
+            end += counts[i]
+            placeholder_str = ", ".join(f"%{x}" for x in range(start, end))
+            param_placeholders_list.append("{" + placeholder_str + "}")
+            start += counts[i]
+        a_params = []
+        b_params = []
+        c_params = []
+        d_params = []
+        for i in range(self.reg_a_count):
+            a_params.append(f' "r"(a[{i}])')
+        for i in range(self.reg_b_count):
+            b_params.append(f' "r"(b[{i}])')
+        for i in range(self.reg_cd_count):
+            # "f" constraint for float accumulators, "r" otherwise.
+            t = "f" if cd_dtype == "f32" else "r"
+            c_params.append(f' "{t}"(c[{i}])')
+            d_params.append(f'"+{t}"(d[{i}])')
+        asm_code = f"""
+        asm volatile(
+          "{asm_op} "
+          "{", ".join(param_placeholders_list)};\\n"
+          : {", ".join(d_params)}
+          : {", ".join(a_params)},
+            {", ".join(b_params)},
+            {", ".join(c_params)}
+        );
+        """
+        # Dedent: measure the indentation of the literal's first line and
+        # strip that many spaces after every newline, then re-indent.
+        space_count = len(re.findall("^\n( +)", asm_code)[0])
+        asm_code = asm_code.replace("\n" + " " * space_count, "\n").strip()
+        asm_code = "".join("\n" + " " * indent + x for x in asm_code.split("\n"))
+        return asm_code
+class WgmmaOpClassImpl:
+    """Generates the C++ ``MmaOpClass`` body for a ``wgmma.mma_async`` op.
+
+    Dtypes may be passed either as dtype-name strings (keys of
+    ``DTYPE_BIT_WIDTH_MAP``) or as objects that are keys of ``DTYPE_MAP``.
+    The generated instruction swaps the project's M/N and A/B roles, as
+    described in the comments in ``generate_ptx``.
+    """
+    def __init__(self, m, n, k, a_dtype, b_dtype, cd_dtype):
+        """Store the shape, normalize dtypes and size the register arrays.
+
+        Raises:
+            ValueError: if the accumulator dtype is not f16, bf16, f32 or s32.
+        """
+        self.shape = (m, n, k)
+        self.a_dtype = a_dtype if isinstance(a_dtype, str) else DTYPE_MAP[a_dtype]
+        self.b_dtype = b_dtype if isinstance(b_dtype, str) else DTYPE_MAP[b_dtype]
+        self.cd_dtype = cd_dtype if isinstance(cd_dtype, str) else DTYPE_MAP[cd_dtype]
+        # Project B (registers) is sized (project N) x (project K) in b_dtype after
+        # the transpose — that's what fills the wgmma A register operand.
+        # NOTE(review): the "// 4" presumably spreads the tile over the four
+        # warps of a warpgroup — confirm.
+        self.reg_b_count = calc_reg_count(n, k, self.b_dtype) // 4
+        self.reg_cd_count = calc_reg_count(m, n, self.cd_dtype) // 4
+        # Pick the C++ value type and register type of the accumulator;
+        # only f32 accumulators use float registers.
+        if self.cd_dtype == "f16":
+            self.val_type_cd = "half"
+            self.reg_cd_type = "uint32_t"
+        elif self.cd_dtype == "bf16":
+            self.val_type_cd = "nv_bfloat16"
+            self.reg_cd_type = "uint32_t"
+        elif self.cd_dtype == "f32":
+            self.val_type_cd = "float"
+            self.reg_cd_type = "float"
+        elif self.cd_dtype == "s32":
+            self.val_type_cd = "int32_t"
+            self.reg_cd_type = "uint32_t"
+        else:
+            raise ValueError(f"Invalid cd_dtype: {cd_dtype}")
+    def to_cpp_str(self, include_class_name=False):
+        """Return the C++ members (types, constants and ``fma``) as source.
+
+        When ``include_class_name`` is true the members are wrapped in
+        ``class MmaOpClass { ... };``.
+        """
+        reg_cd_type = self.reg_cd_type
+        lines = [
+            "static constexpr MmaType kMmaType = MmaType::WGMMA;",
+            f"using MmaShape = Shape<{self.shape[0]}, {self.shape[1]}, {self.shape[2]}>;",
+            "",
+            f"using ValTypeC = {self.val_type_cd};",
+            f"using ValTypeD = {self.val_type_cd};",
+            "",
+            f"static constexpr uint32_t kATypeBits = {DTYPE_BIT_WIDTH_MAP[self.a_dtype]};",
+            f"static constexpr uint32_t kBTypeBits = {DTYPE_BIT_WIDTH_MAP[self.b_dtype]};",
+            f"static constexpr uint32_t kCTypeBits = {DTYPE_BIT_WIDTH_MAP[self.cd_dtype]};",
+            f"static constexpr uint32_t kDTypeBits = {DTYPE_BIT_WIDTH_MAP[self.cd_dtype]};",
+            "",
+            f"using BRegisters = uint32_t[{self.reg_b_count}];",
+            f"using CRegisters = {self.reg_cd_type}[{self.reg_cd_count}];",
+            f"using DRegisters = {self.reg_cd_type}[{self.reg_cd_count}];",
+            "",
+            "CUDA_INLINE",
+            f"static void fma(uint64_t &desc, uint32_t *b, {reg_cd_type} *d, bool pred = true) {{",
+            *self.generate_ptx(indent=2, has_scale_d=True).strip("\n").split("\n"),
+            "};",
+        ]
+        # Indent every non-empty line by two spaces; blank separators stay empty.
+        code = "\n".join("  " + x if x else x for x in lines)
+        if include_class_name:
+            code = f"class MmaOpClass {{\n{code}\n}};"
+        return code
+    def generate_ptx(self, indent=2, has_scale_d=True):
+        """Return the ``asm volatile`` statement for the ``wgmma`` op.
+
+        With ``has_scale_d`` the statement declares a predicate ``p`` set
+        from the trailing ``pred`` operand and passes it to the
+        instruction; otherwise a literal ``1`` is passed instead. Every
+        line of the result is prefixed with a newline and ``indent`` spaces.
+        """
+        a_dtype = self.a_dtype
+        b_dtype = self.b_dtype
+        cd_dtype = self.cd_dtype
+        m, n, k = self.shape
+        # Swap M<->N and A-dtype<->B-dtype in PTX: project's A becomes wgmma's B and
+        # project's B becomes wgmma's A. The PTX dtype suffix order is .cd.a.b, so
+        # the wgmma A slot takes project's b_dtype and the wgmma B slot takes a_dtype.
+        asm_op = f"wgmma.mma_async.sync.aligned.m{n}n{m}k{k}"
+        asm_op += f".{cd_dtype}.{b_dtype}.{a_dtype}"
+        # satfinite gates on the wgmma-A operand dtype (= project's B).
+        if "s" in b_dtype:
+            asm_op += ".satfinite"
+        start = 0
+        end = 0
+        param_placeholders_list = []
+        # Placeholder groups follow asm operand order: the d accumulators
+        # first, then the b registers; the descriptor placeholder comes next.
+        counts = [self.reg_cd_count, self.reg_b_count]
+        for i in range(len(counts)):
+            end += counts[i]
+            placeholder_str = ", ".join(f"%{x}" for x in range(start, end))
+            param_placeholders_list.append("{" + placeholder_str + "}")
+            start += counts[i]
+        param_placeholders_list.append(f"%{sum(counts)}")
+        other_ptx_args = ", p" if has_scale_d else ", 1"
+        # The dtype-specific PTX tail args (scale/trans flags) gate on the wgmma-A
+        # operand dtype, which after the swap is project's b_dtype.
+        if self.b_dtype in ["f16", "bf16"]:
+            other_ptx_args += ", 1, 1, 0"
+        elif self.b_dtype in ["e4m3", "e5m2", "e2m1"]:
+            other_ptx_args += ", 1, 1"
+        # Project A's smem descriptor fills the wgmma B operand.
+        a_desc_param = ' "l"(desc)'
+        # Project B's registers fill the wgmma A operand.
+        b_params = []
+        cd_params = []
+        for i in range(self.reg_b_count):
+            b_params.append(f' "r"(b[{i}])')
+        for i in range(self.reg_cd_count):
+            # "f" constraint for float accumulators, "r" otherwise.
+            t = "f" if cd_dtype == "f32" else "r"
+            cd_params.append(f'"+{t}"(d[{i}])')
+        # Lay the accumulator operands out four per line; continuation
+        # lines get four extra spaces.
+        cd_param_str = ""
+        for i in range(math.ceil(len(cd_params) / 4)):
+            cd_params_part = cd_params[i * 4 : (i + 1) * 4]
+            cd_params_part_str = ", ".join(cd_params_part) + ",\n"
+            if cd_param_str:
+                cd_params_part_str = "    " + cd_params_part_str
+            cd_param_str += cd_params_part_str
+        cd_param_str = cd_param_str.strip().strip(",")
+        if has_scale_d:
+            asm_code = f"""
+            asm volatile(
+              "{{\\n"
+                ".reg .pred p;\\n"
+                "setp.ne.b32 p, %{sum(counts) + 1}, 0;\\n"
+                "{asm_op} "
+                "{", ".join(param_placeholders_list)}{other_ptx_args};\\n"
+              "}}\\n"
+              : {cd_param_str}
+              : {", ".join(b_params)},
+                {a_desc_param}, "r"((uint32_t)pred)
+            );
+            """
+        else:
+            asm_code = f"""
+            asm volatile(
+            "{asm_op} "
+            "{", ".join(param_placeholders_list)}{other_ptx_args};\\n"
+            : {cd_param_str}
+            : {", ".join(b_params)},
+                {a_desc_param}
+            );
+            """
+        # Dedent: measure the indentation of the literal's first line and
+        # strip that many spaces after every newline, then re-indent.
+        space_count = len(re.findall("^\n( +)", asm_code)[0])
+        asm_code = asm_code.replace("\n" + " " * space_count, "\n").strip()
+        asm_code = "".join("\n" + " " * indent + x for x in asm_code.split("\n"))
+        return asm_code
+class MmaOpClass:
+    @classmethod
+    def from_config(cls, mma_type, m, n, k, a_dtype, b_dtype, cd_dtype):
+        mma_type = mma_type if isinstance(mma_type, MmaType) else getattr(MmaType, mma_type.upper())
+        if mma_type == MmaType.MMA:
+            return MmaOpClassImpl(m, n, k, a_dtype, b_dtype, cd_dtype)
+        elif mma_type == MmaType.WGMMA:
+            return WgmmaOpClassImpl(m, n, k, a_dtype, b_dtype, cd_dtype)
+        else:
+            raise ValueError(f"Invalid MMA Type: {mma_type}")