fbgemm-gpu-genai-nightly 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fbgemm-gpu-genai-nightly might be problematic.

Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0

fbgemm_gpu/quantize/__init__.py
@@ -0,0 +1,43 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ from fbgemm_gpu.quantize.quantize_ops import dequantize_mx, quantize_mx  # noqa F401
+ from fbgemm_gpu.utils import TorchLibraryFragment
+
+ lib = TorchLibraryFragment("fbgemm")
+
+ lib.define(
+     """quantize_mx(
+         Tensor input,
+         int scale_bits,
+         int elem_ebits,
+         int elem_mbits,
+         float elem_max_norm,
+         int mx_group_size,
+         int? rounding_mode = None
+     ) -> Tensor
+     """
+ )
+
+ lib.define(
+     """dequantize_mx(
+         Tensor input,
+         int mx_group_size
+     ) -> Tensor
+     """
+ )
+
+ lib.register(
+     "quantize_mx",
+     {"CUDA": quantize_mx, "CPU": quantize_mx},
+ )
+
+ lib.register(
+     "dequantize_mx",
+     {"CUDA": dequantize_mx, "CPU": dequantize_mx},
+ )
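
The fragment above only declares schemas and dispatch entries. Below is a minimal sketch (not part of the package) of how the resulting operators could be exercised through the PyTorch dispatcher, assuming the wheel and its extensions import cleanly on the host. The tensor shape, group size, and keyword values are illustrative and simply mirror the schema defaults.

import torch

import fbgemm_gpu.quantize  # noqa: F401  # runs the lib.define()/lib.register() calls above

# 64 FP32 values -> two MX4 groups of 32 elements each.
x = torch.randn(64, dtype=torch.float32)

packed = torch.ops.fbgemm.quantize_mx(
    x,
    scale_bits=8,       # shared exponent bits (MX4 e2m1)
    elem_ebits=2,
    elem_mbits=3,
    elem_max_norm=6.0,
    mx_group_size=32,
)                       # rounding_mode is left at its schema default (None)

restored = torch.ops.fbgemm.dequantize_mx(packed, 32)
assert restored.numel() == x.numel()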

fbgemm_gpu/quantize/quantize_ops.py
@@ -0,0 +1,64 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ from typing import Union
+
+ import torch
+
+ from fbgemm_gpu.quantize_utils import fp32_to_mx4, mx4_to_fp32, RoundingMode
+
+
+ def quantize_mx(
+     input: torch.Tensor,
+     scale_bits: int = 8,
+     elem_ebits: int = 2,
+     elem_mbits: int = 3,
+     elem_max_norm: float = 6.0,
+     mx_group_size: int = 32,
+     rounding_mode: Union[RoundingMode, int] = RoundingMode.even,
+ ) -> torch.Tensor:
+     """
+     Registered quantize_mx op for E2E comm
+     (registration is done in __init__.py).
+     We use the Triton implementation for quantization.
+     Args:
+         input: FP32 tensor of size total_elems to be quantized
+         scale_bits: num bits of the shared exponent (i.e., 8 for MX4 e2m1)
+         elem_ebits: num bits of the exponent (i.e., 2 for MX4 e2m1)
+         elem_mbits: num bits of the mantissa incl. sign and implicit bits (
+             i.e., 3 for MX4 e2m1)
+         elem_max_norm: max value of the float (i.e., 6.0 for MX4 e2m1)
+         mx_group_size: num elements that share the max shared_exponent
+         rounding_mode: which type of rounding to use when calculating the shared exponent
+
+     Return:
+         output: MX4 tensor packed into int8 values with size
+             (total_elems / 2 + total_elems / groupsize);
+             the shared exponent of each group is stored in the last byte
+             of that group's output
+     """
+     return fp32_to_mx4(
+         input, mx_group_size, rounding_mode=rounding_mode, use_triton=True
+     )
+
+
+ def dequantize_mx(
+     input: torch.Tensor,
+     mx_group_size: int = 32,
+ ) -> torch.Tensor:
+     """
+     Registered dequantize_mx op for E2E comm
+     (registration is done in __init__.py to prevent multiple loading).
+     We use the Triton implementation for dequantization.
+     Args:
+         input: FP8 tensor (MX4 packed in FP8)
+         mx_group_size: number of elements that share the same max shared_exponent
+
+     Return:
+         output: FP32 tensor with total elements (total_elems)
+     """
+     return mx4_to_fp32(input, mx_group_size, use_triton=True)
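
The size relationship stated in the docstrings above (half a byte per element plus one shared-exponent byte per group) can be checked directly against these Python-level entry points. The sketch below is illustrative, assumes the package imports on the host, and uses an arbitrary 1024-element input.

import torch

from fbgemm_gpu.quantize.quantize_ops import dequantize_mx, quantize_mx

total_elems, group = 1024, 32
x = torch.randn(total_elems, dtype=torch.float32)

# Two FP4 values per byte plus one shared-exponent byte per group of 32.
packed = quantize_mx(x, mx_group_size=group)
assert packed.numel() == total_elems // 2 + total_elems // group  # 512 + 32 = 544

roundtrip = dequantize_mx(packed, mx_group_size=group)
assert roundtrip.numel() == total_elems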

fbgemm_gpu/quantize_comm.py
@@ -0,0 +1,315 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ # The code in this file is refactored from https://fburl.com/code/p2gy2gxb
+ # based on "Amy Yang et al., Training Deep Learning Recommendation Model with
+ # Quantized Collective Communications", DLP-KDD 2020.
+
+
+ import logging
+ from typing import Optional, TypeVar
+
+ import torch
+
+ from fbgemm_gpu.quantize_utils import (
+     bf16_to_fp32,
+     fp16_to_fp32,
+     fp32_to_bf16_with_clamp,
+     fp32_to_fp16_with_clamp,
+     fp32_to_hfp8_with_clamp,
+     fp32_to_mx4,
+     hfp8_to_fp32,
+     mx4_to_fp32,
+     RoundingMode,
+ )
+
+ from fbgemm_gpu.split_embedding_configs import SparseType
+
+ from torch.autograd.profiler import record_function  # usort:skip
+ from dataclasses import dataclass
+
+ import fbgemm_gpu.quantize.quantize_ops  # noqa F401
+
+ logger: logging.Logger = logging.getLogger()
+
+ # FP8 configurations
+ ebits, mbits, bias = 4, 3, 15
+ max_pos: float = (2 ** ((1 << ebits) - 2 - bias)) * (2 - 2 ** (-mbits))
+
+ # INT8 configurations
+ ROW_DIM_DEFAULT = 32
+
+ # MX4 configurations
+ MX_GROUP_SIZE_DEFAULT = 32
+
+
+ def none_throws(
+     # pyre-fixme[31]: Expression `typing.Optional[typing.TypeVar("_T")]` is not a
+     # valid type.
+     optional: Optional[TypeVar("_T")],
+     message: str = "Unexpected `None`",
+     # pyre-fixme[31]: Expression `typing.TypeVar("_T")` is not a valid type.
+ ) -> TypeVar("_T"):
+     if optional is None:
+         raise AssertionError(message)
+     return optional
+
+
+ @dataclass
+ class QuantizationContext:
+     row_dim: int = ROW_DIM_DEFAULT
+     row_dim_quant: int = -1
+     mx_group_size: int = MX_GROUP_SIZE_DEFAULT
+     rounding_mode: Optional[RoundingMode] = RoundingMode.even
+     padded_dim_sum_per_rank: Optional[list[int]] = None
+
+
+ def _quantize_tensor(
+     input_tensor: torch.Tensor,
+     comm_precision: SparseType,
+     ctx: Optional[QuantizationContext] = None,
+     is_fwd: bool = True,
+ ) -> torch.Tensor:
+     if comm_precision == SparseType.FP32:
+         return input_tensor
+     elif comm_precision == SparseType.FP16:
+         return fp32_to_fp16_with_clamp(input_tensor)
+     elif comm_precision == SparseType.BF16:
+         return fp32_to_bf16_with_clamp(input_tensor)
+     elif comm_precision == SparseType.FP8:
+         # return fp32_to_hfp8_with_clamp(input_tensor, ebits, mbits, bias)
+         if ctx is not None and ctx.row_dim > 0:
+             ctx = none_throws(ctx)
+             row_dim = ctx.row_dim
+             input_2d = input_tensor.view((-1, row_dim)) if row_dim > 0 else input_tensor
+             input_2d_quant = torch.ops.fbgemm.FloatToFP8RowwiseQuantized(
+                 input_2d, is_fwd
+             )
+             row_dim_quant = input_2d_quant.shape[1]
+             input_quant_all2all = None
+             input_quant_all2all = input_2d_quant.view((-1))
+             ctx.row_dim_quant = row_dim_quant
+             return input_quant_all2all
+         else:
+             return fp32_to_hfp8_with_clamp(input_tensor, ebits, mbits, bias)
+     elif comm_precision == SparseType.INT8:
+         ctx = none_throws(ctx)
+         row_dim = ctx.row_dim
+         input_2d = input_tensor.view((-1, row_dim)) if row_dim > 0 else input_tensor
+         input_2d_quant = torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized(input_2d)
+         row_dim_quant = input_2d_quant.shape[1]
+         input_quant_all2all = None
+         input_quant_all2all = input_2d_quant.view((-1))
+         ctx.row_dim_quant = row_dim_quant
+         return input_quant_all2all
+     elif comm_precision == SparseType.MX4:
+         mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT
+         rounding_mode = ctx.rounding_mode if ctx is not None else RoundingMode.even
+         return fp32_to_mx4(
+             input_tensor, mx_group_size, rounding_mode=rounding_mode
+         ).view(-1)
+     else:
+         raise ValueError(f"comm_precision={comm_precision} is not supported")
+
+
+ def _dequantize_tensor(
+     quantized_tensor: torch.Tensor,
+     comm_precision: SparseType,
+     ctx: Optional[QuantizationContext] = None,
+     is_fwd: bool = True,
+     fp8_output_dtype: Optional[SparseType] = None,
+ ) -> torch.Tensor:
+     if comm_precision == SparseType.FP32:
+         assert quantized_tensor.dtype == torch.float
+         return quantized_tensor
+     elif comm_precision == SparseType.FP16:
+         assert quantized_tensor.dtype == torch.half
+         return fp16_to_fp32(quantized_tensor)
+     elif comm_precision == SparseType.BF16:
+         assert quantized_tensor.dtype == torch.bfloat16
+         return bf16_to_fp32(quantized_tensor)
+     elif comm_precision == SparseType.FP8:
+         if ctx is not None and ctx.row_dim > 0:
+             row_dim_quant = ctx.row_dim_quant
+             quantized_tensor_2d = quantized_tensor.view((-1, row_dim_quant))
+             # use provided fp8_output_dtype or default to FP32 (0)
+             output_dtype_int = (
+                 fp8_output_dtype.as_int() if fp8_output_dtype is not None else 0
+             )
+             dequant_tensor = torch.ops.fbgemm.FP8RowwiseQuantizedToFloat(
+                 quantized_tensor_2d,
+                 is_fwd,
+                 output_dtype_int,
+             )
+             return dequant_tensor.view(-1)
+         else:
+             assert quantized_tensor.dtype == torch.uint8
+             return hfp8_to_fp32(quantized_tensor, ebits, bias)
+     elif comm_precision == SparseType.INT8:
+         ctx = none_throws(ctx)
+         row_dim_quant = ctx.row_dim_quant
+         quantized_tensor_2d = quantized_tensor.view((-1, row_dim_quant))
+         dequant_tensor = torch.ops.fbgemm.Fused8BitRowwiseQuantizedToFloat(
+             quantized_tensor_2d
+         )
+         return dequant_tensor.view(-1)
+     elif comm_precision == SparseType.MX4:
+         mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT
+         return mx4_to_fp32(quantized_tensor, mx_group_size)
+     else:
+         raise ValueError(f"comm_precision={comm_precision} is not supported")
+
+
+ class QuantizedCommCodec:
+     # Concrete implementation of QuantizedCommCodec provided by FBGEMM functions.
+     def __init__(
+         self,
+         comm_precision: SparseType,
+         loss_scale: Optional[float] = None,
+         row_dim: Optional[int] = None,
+         is_fwd: bool = True,
+         rounding_mode: Optional[RoundingMode] = None,
+         fp8_output_dtype: Optional[SparseType] = None,
+     ) -> None:
+         if loss_scale is not None:
+             if comm_precision not in [SparseType.FP16, SparseType.BF16]:
+                 logger.warning(
+                     f"Setting loss scale for comm_precision={comm_precision} is not supported. Overriding to None"
+                 )
+                 loss_scale = None
+
+         logger.info(
+             f"Creating QuantizedCommsCodec comm_precision:{comm_precision}, loss_scale:{loss_scale}"
+         )
+
+         self._comm_precision = comm_precision
+         self._loss_scale = loss_scale
+         self._is_fwd = is_fwd
+         self._row_dim: int = -1 if row_dim is None else row_dim
+         self._rounding_mode: Optional[RoundingMode] = rounding_mode
+         self._fp8_output_dtype: Optional[SparseType] = fp8_output_dtype
+         if self._comm_precision == SparseType.MX4:
+             self._row_dim = MX_GROUP_SIZE_DEFAULT if row_dim is None else row_dim
+             self._rounding_mode = (
+                 RoundingMode.even if rounding_mode is None else rounding_mode
+             )
+
+     def encode(
+         self, input_tensor: torch.Tensor, ctx: Optional[QuantizationContext] = None
+     ) -> torch.Tensor:
+         if self._loss_scale is not None:
+             input_tensor = self._loss_scale * input_tensor
+         with record_function(
+             f"## encoder {self._comm_precision} {self._loss_scale} ##"
+         ):
+             output = _quantize_tensor(
+                 input_tensor,
+                 self._comm_precision,
+                 ctx,
+                 self._is_fwd,
+             )
+         return output
+
+     def decode(
+         self, input_tensor: torch.Tensor, ctx: Optional[QuantizationContext] = None
+     ) -> torch.Tensor:
+         if self._loss_scale is not None:
+             input_tensor = input_tensor / self._loss_scale
+         with record_function(
+             f"## decoder {self._comm_precision} {self._loss_scale} ##"
+         ):
+             dequantized_tensor = _dequantize_tensor(
+                 input_tensor,
+                 self._comm_precision,
+                 ctx,
+                 self._is_fwd,
+                 fp8_output_dtype=self._fp8_output_dtype,
+             )
+         return dequantized_tensor
+
+     def calc_quantized_size(
+         self, input_len: int, ctx: Optional[QuantizationContext] = None
+     ) -> int:
+         # Use the same logic in _float_to_fused8bitrowwise_gpu_t()
+         if self._comm_precision == SparseType.INT8 or (
+             self._comm_precision == SparseType.FP8 and self._row_dim > 0
+         ):
+             ctx = none_throws(ctx)
+             torch._check(
+                 input_len % ctx.row_dim == 0,
+                 lambda: f"input_len {input_len} is not a multiple of row dim {ctx.row_dim}",
+             )
+             assert input_len % ctx.row_dim == 0, (
+                 f"input_len {input_len} is not a multiple of row dim {ctx.row_dim} "
+                 "Please check your batch size (power of 2 batch size is recommended)"
+             )
+             nrows = input_len // ctx.row_dim
+             ncols = (ctx.row_dim + 3) // 4 * 4 + 2 * 4
+             return nrows * ncols
+         elif self._comm_precision == SparseType.MX4:
+             if ctx:
+                 group_size = ctx.mx_group_size
+             else:
+                 group_size = MX_GROUP_SIZE_DEFAULT
+             assert (
+                 input_len % group_size == 0
+             ), f"input_len {input_len} needs to be multiple of group_size {group_size}"
+             # quantized output size = half input size + number of groups (shared exp)
+             ctx = none_throws(ctx)
+             return (input_len // 2) + (input_len // ctx.mx_group_size)
+         else:
+             return input_len
+
+     @property
+     def quantized_dtype(self) -> torch.dtype:
+         return self._comm_precision.as_dtype()
+
+     def create_context(self) -> Optional[QuantizationContext]:
+         # fp8 rowwise is activated when row_dim > 0
+         if self._comm_precision == SparseType.FP8:
+             return QuantizationContext(self._row_dim)
+         if self._comm_precision == SparseType.MX4:
+             return QuantizationContext(
+                 row_dim=self._row_dim,
+                 mx_group_size=self._row_dim,
+                 rounding_mode=self._rounding_mode,
+             )
+         # int8 rowwise is default
+         return QuantizationContext()
+
+     def padded_size(
+         self,
+         input_tensor: torch.Tensor,
+         dim_per_rank: list[int],
+         my_rank: int,
+         qcomm_ctx: QuantizationContext,
+     ) -> tuple[int, int]:
+         if input_tensor.ndim == 1:
+             return input_tensor.shape[0], 0
+         # return padded size for the feature dimension (dim 1), 0 if no padding needed.
+         padded_dim_sum, padding_size = input_tensor.shape[1], 0
+         if self._comm_precision == SparseType.MX4:
+             group_size = qcomm_ctx.mx_group_size
+             padding_size_per_rank = [
+                 group_size - (t if (t := dim_sum % group_size) > 0 else group_size)
+                 for dim_sum in dim_per_rank
+             ]
+             padded_dim_sum_per_rank = [
+                 a + b for a, b in zip(dim_per_rank, padding_size_per_rank)
+             ]
+             dim_sum, padding_size = (
+                 dim_per_rank[my_rank],
+                 padding_size_per_rank[my_rank],
+             )
+             assert input_tensor.ndim == 2 and input_tensor.shape[1] == dim_sum
+             qcomm_ctx.padded_dim_sum_per_rank = padded_dim_sum_per_rank
+             padded_dim_sum = padding_size + dim_sum
+             return padded_dim_sum, padding_size
+
+         return padded_dim_sum, padding_size
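
To make the control flow above concrete, here is an illustrative sketch (not part of the package) of driving the codec the way a quantized collective-communication hook would: build a codec per precision, derive a context, and round-trip a flattened tensor. The FP16 path is pure PyTorch; the MX4 size check simply restates the arithmetic in calc_quantized_size(). The tensor shape and the loss scale are arbitrary choices for the example.

import torch

from fbgemm_gpu.quantize_comm import QuantizedCommCodec
from fbgemm_gpu.split_embedding_configs import SparseType

grad = torch.randn(4, 32)  # 128 elements, a multiple of the MX4 group size

# FP16 with loss scaling: encode scales, clamps, and casts to half; decode rescales.
fp16_codec = QuantizedCommCodec(SparseType.FP16, loss_scale=128.0)
ctx = fp16_codec.create_context()
decoded = fp16_codec.decode(fp16_codec.encode(grad.flatten(), ctx), ctx)
assert decoded.dtype == torch.float32 and decoded.numel() == grad.numel()

# MX4: quantized length is half the elements plus one shared-exponent byte per group.
mx4_codec = QuantizedCommCodec(SparseType.MX4)
mx4_ctx = mx4_codec.create_context()
n = grad.numel()
assert mx4_codec.calc_quantized_size(n, mx4_ctx) == n // 2 + n // 32  # 64 + 4 = 68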

fbgemm_gpu/quantize_utils.py
@@ -0,0 +1,246 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import logging
+ from typing import Optional, Union
+
+ import torch  # isort:skip
+
+ import fbgemm_gpu
+
+ from fbgemm_gpu.triton import dequantize_mx4, quantize_mx4, RoundingMode
+ from fbgemm_gpu.triton.quantize_ref import py_dequantize_mx4, py_quantize_mx4
+
+ try:
+     # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+     open_source = bool(getattr(fbgemm_gpu, "open_source", False))
+ except NotImplementedError:
+     open_source = False
+
+ # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+ if not open_source:
+     from mtia.kernels.triton.mx4.quantize import (
+         triton_dequantize_mx4 as mtia_dequantize_mx4,
+         triton_quantize_mx4 as mtia_quantize_mx4,
+     )
+
+ logger: logging.Logger = logging.getLogger()
+
+ try:
+     # pyre-ignore[21]
+     from fbgemm_gpu import open_source  # noqa: F401
+ except Exception:
+     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+ TORCH_HALF_MIN: float = torch.finfo(torch.float16).min
+ TORCH_HALF_MAX: float = torch.finfo(torch.float16).max
+
+ TORCH_BFLOAT16_MIN: float = torch.finfo(torch.bfloat16).min
+ TORCH_BFLOAT16_MAX: float = torch.finfo(torch.bfloat16).max
+
+
+ def fp32_to_mx4(
+     tensor: torch.Tensor,
+     group_size: int = 32,
+     ebits: int = 2,
+     mbits: int = 1,
+     rounding_mode: Optional[Union[RoundingMode, int]] = RoundingMode.even,
+     stochastic_casting: bool = False,
+     use_triton: bool = True,
+ ) -> torch.Tensor:
+     """Quantize an FP32 tensor to MX4 with triton or native cuda impl.
+
+     Args:
+         tensor (torch.Tensor): FP32 tensor to quantize with M total elements.
+         group_size (int): Compute scale in chunks of group_size.
+         ebits (int): Number of exponent bits in target mx4 format.
+         mbits (int): Number of mantissa bits in target mx4 format.
+         rounding_mode (RoundingMode or int): Which type of rounding to use when computing exponent.
+             Only supported with use_triton=True.
+         stochastic_casting (bool): Whether to use stochastic casting when downcasting.
+         use_triton (bool): If set, use triton quantization, otherwise cuda.
+
+     Return:
+         output: MX4 tensor packed into int8 values with total elements (M / 2 + M / groupsize)
+     """
+     # Accelerated MX4 is only available on cuda; if input is on cpu, use python.
+     # Operate on flattened input.
+     if rounding_mode is None:
+         rounding_mode = RoundingMode.even
+
+     if not tensor.is_cuda and not tensor.is_mtia:
+         return py_quantize_mx4(
+             tensor,
+             group_size,
+             ebits=ebits,
+             mbits=mbits,
+             rounding_mode=rounding_mode,
+             stochastic_casting=stochastic_casting,
+         )
+
+     if use_triton:
+         if tensor.is_mtia:
+             return mtia_quantize_mx4(
+                 tensor,
+                 group_size,
+                 ebits=ebits,
+                 mbits=mbits,
+                 rounding_mode=rounding_mode,
+                 stochastic_casting=stochastic_casting,
+             )
+         return quantize_mx4(
+             tensor,
+             group_size,
+             ebits=ebits,
+             mbits=mbits,
+             rounding_mode=rounding_mode,
+             stochastic_casting=stochastic_casting,
+         )
+     else:
+         out = torch.ops.fbgemm.quantize_mx_cuda(
+             tensor.flatten(),
+             scale_bits=8,
+             elem_ebits=2,
+             elem_mbits=3,
+             elem_max_norm=6.0,
+             mx_group_size=group_size,
+         )
+         # Preserve input dimensions.
+         output_shape = list(tensor.shape[:-1]) + [-1]
+         return out.view(output_shape)
+
+
+ def mx4_to_fp32(
+     tensor: torch.Tensor,
+     group_size: int = 32,
+     use_triton: bool = True,
+     ebits: int = 2,
+     mbits: int = 1,
+ ) -> torch.Tensor:
+     """Dequantize an MX4 tensor to FP32 with triton or native cuda impl.
+
+     Args:
+         tensor (torch.Tensor): MX4 packed tensor with total elements (M / 2 + M / groupsize)
+         group_size (int): Compute scale in chunks of group_size.
+         use_triton (bool): If set, use triton dequantization, otherwise cuda.
+         ebits (int): Number of exponent bits in target mx4 format.
+         mbits (int): Number of mantissa bits in target mx4 format.
+
+     Return:
+         output: FP32 tensor with total elements (M).
+     """
+     # Accelerated MX4 dequantize is only available on cuda; if input is on cpu, use python.
+     if not tensor.is_cuda and not tensor.is_mtia:
+         return py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+     if use_triton:
+         if tensor.is_mtia:
+             return mtia_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+         return dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+     else:
+         return torch.ops.fbgemm.dequantize_mx_cuda(tensor.flatten(), group_size)
+
+
+ def fp32_to_fp16_with_clamp(tensor: torch.Tensor) -> torch.Tensor:
+     return torch.clamp(tensor, TORCH_HALF_MIN, TORCH_HALF_MAX).half()
+
+
+ def fp32_to_bf16_with_clamp(tensor: torch.Tensor) -> torch.Tensor:
+     return torch.clamp(tensor, TORCH_BFLOAT16_MIN, TORCH_BFLOAT16_MAX).bfloat16()
+
+
+ def fp32_to_hfp8_with_clamp(
+     tensor: torch.Tensor, ebits: int = 4, mbits: int = 3, bias: int = 15
+ ) -> torch.Tensor:
+     max_pos: float = (2 ** ((1 << ebits) - 2 - bias)) * (2 - 2 ** (-mbits))
+     return torch.ops.fbgemm.FloatToHFP8Quantized(
+         tensor.contiguous(),
+         ebits,
+         bias,
+         max_pos,
+     )
+
+
+ def fp16_to_fp32(tensor: torch.Tensor) -> torch.Tensor:
+     return tensor.float()
+
+
+ def bf16_to_fp32(tensor: torch.Tensor) -> torch.Tensor:
+     return tensor.view(torch.bfloat16).float()
+
+
+ def hfp8_to_fp32(tensor: torch.Tensor, ebits: int = 4, bias: int = 15) -> torch.Tensor:
+     return torch.ops.fbgemm.HFP8QuantizedToFloat(
+         tensor.contiguous().view(torch.uint8),
+         ebits,
+         bias,
+     )
+
+
+ def measure_fp16_quant_error(input_tensor: torch.Tensor) -> None:
+     # TODO: log to tensorboard
+
+     num_nan_fp32_tensor = torch.numel(input_tensor[torch.isnan(input_tensor)])
+     logger.info(
+         "num NaN in fp32 tensor: {}, ratio: {}.".format(
+             num_nan_fp32_tensor, num_nan_fp32_tensor / torch.numel(input_tensor)
+         )
+     )
+
+     logger.info(
+         "fp32 tensor profile: min: {}, max: {}, min abs:{}, max abs:{}.".format(
+             torch.min(input_tensor),
+             torch.max(input_tensor),
+             torch.min(torch.abs(input_tensor)),
+             torch.max(torch.abs(input_tensor)),
+         )
+     )
+
+     fp16_tensor = fp32_to_fp16_with_clamp(input_tensor)
+     num_nan_fp16_tensor = torch.numel(fp16_tensor[torch.isnan(fp16_tensor)])
+
+     logger.info(
+         "num NaN in fp16 tensor: {}, ratio: {}.".format(
+             num_nan_fp16_tensor, num_nan_fp16_tensor / torch.numel(input_tensor)
+         )
+     )
+
+     diff = torch.abs(input_tensor - fp16_tensor.float())
+     rel_diff = diff / torch.abs(input_tensor)
+     logger.info(
+         "fp32_to_fp16 abs error: min={}, max={}, avg={}.".format(
+             torch.min(diff), torch.max(diff), torch.mean(diff)
+         )
+     )
+
+     rel_diff_not_nan = rel_diff[torch.logical_not(torch.isnan(rel_diff))]
+     logger.info(
+         "fp32_to_fp16 rel error: min={}, max={}, avg={}.".format(
+             torch.min(rel_diff_not_nan),
+             torch.max(rel_diff_not_nan),
+             torch.mean(rel_diff_not_nan),
+         )
+     )
+
+     rel_diff_1_idx = torch.where(rel_diff == 1.0)
+     fp32_rel_err_1_vals = input_tensor[rel_diff_1_idx]
+     if torch.numel(fp32_rel_err_1_vals) > 0:
+         fp32_rel_err_1_vals = torch.abs(fp32_rel_err_1_vals)
+         logger.info(
+             "fp32_to_fp16 rel error == 1: fp32 min:{}, fp32 max:{}, fp32 avg:{}.".format(
+                 torch.min(fp32_rel_err_1_vals),
+                 torch.max(fp32_rel_err_1_vals),
+                 torch.mean(fp32_rel_err_1_vals),
+             )
+         )
+
+     subrange_ratio = torch.numel(fp16_tensor[rel_diff_1_idx]) / torch.numel(
+         fp16_tensor
+     )
+     logger.info("sub fp16 range ratio: {}".format(subrange_ratio))