fbgemm-gpu-genai-nightly 2026.1.4__cp313-cp313-manylinux_2_28_x86_64.whl → 2026.1.9__cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 
 {
-  "version": "2026.1.4",
+  "version": "2026.1.9",
   "target": "genai",
   "variant": "cuda"
 }
fbgemm_gpu/fbgemm.so CHANGED
Binary file
fbgemm_gpu/quantize_comm.py CHANGED
@@ -25,7 +25,7 @@ from fbgemm_gpu.quantize_utils import (
     fp32_to_hfp8_with_clamp,
     fp32_to_mx4,
     hfp8_to_fp32,
-    mx4_to_fp32,
+    mx4_to_float,
     RoundingMode,
 )
 
@@ -123,7 +123,7 @@ def _dequantize_tensor(
     comm_precision: SparseType,
     ctx: Optional[QuantizationContext] = None,
     is_fwd: bool = True,
-    fp8_output_dtype: Optional[SparseType] = None,
+    output_dtype: Optional[SparseType] = None,
 ) -> torch.Tensor:
     if comm_precision == SparseType.FP32:
         assert quantized_tensor.dtype == torch.float
@@ -138,10 +138,8 @@ def _dequantize_tensor(
         if ctx is not None and ctx.row_dim > 0:
             row_dim_quant = ctx.row_dim_quant
             quantized_tensor_2d = quantized_tensor.view((-1, row_dim_quant))
-            # use provided fp8_output_dtype or default to FP32 (0)
-            output_dtype_int = (
-                fp8_output_dtype.as_int() if fp8_output_dtype is not None else 0
-            )
+            # use provided output_dtype or default to FP32 (0)
+            output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
             dequant_tensor = torch.ops.fbgemm.FP8RowwiseQuantizedToFloat(
                 quantized_tensor_2d,
                 is_fwd,
@@ -161,7 +159,7 @@ def _dequantize_tensor(
             return dequant_tensor.view(-1)
     elif comm_precision == SparseType.MX4:
         mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT
-        return mx4_to_fp32(quantized_tensor, mx_group_size)
+        return mx4_to_float(quantized_tensor, mx_group_size, output_dtype=output_dtype)
     else:
         raise ValueError(f"comm_precision={comm_precision} is not supported")
 
@@ -175,7 +173,7 @@ class QuantizedCommCodec:
         row_dim: Optional[int] = None,
         is_fwd: bool = True,
         rounding_mode: Optional[RoundingMode] = None,
-        fp8_output_dtype: Optional[SparseType] = None,
+        output_dtype: Optional[SparseType] = None,
     ) -> None:
         if loss_scale is not None:
             if comm_precision not in [SparseType.FP16, SparseType.BF16]:
@@ -193,7 +191,7 @@ class QuantizedCommCodec:
         self._is_fwd = is_fwd
         self._row_dim: int = -1 if row_dim is None else row_dim
         self._rounding_mode: Optional[RoundingMode] = rounding_mode
-        self._fp8_output_dtype: Optional[SparseType] = fp8_output_dtype
+        self._output_dtype: Optional[SparseType] = output_dtype
         if self._comm_precision == SparseType.MX4:
             self._row_dim = MX_GROUP_SIZE_DEFAULT if row_dim is None else row_dim
             self._rounding_mode = (
@@ -229,7 +227,7 @@ class QuantizedCommCodec:
             self._comm_precision,
             ctx,
             self._is_fwd,
-            fp8_output_dtype=self._fp8_output_dtype,
+            output_dtype=self._output_dtype,
         )
         return dequantized_tensor
 
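The quantize_comm.py hunks above rename the codec's fp8_output_dtype parameter to output_dtype and thread it through to dequantization, so MX4 payloads can now be decoded straight to BF16. A minimal usage sketch follows; apart from output_dtype, the constructor arguments shown here (comm_precision and the keyword style) are assumptions inferred from the hunks, not the full class definition:

# Hedged sketch: decode MX4 comm payloads directly to BF16.
# Parameter names other than output_dtype are inferred from the diff above.
from fbgemm_gpu.quantize_comm import QuantizedCommCodec
from fbgemm_gpu.split_embedding_configs import SparseType

codec = QuantizedCommCodec(
    comm_precision=SparseType.MX4,  # payloads are quantized to MX4 on encode()
    output_dtype=SparseType.BF16,   # decode() returns BF16 instead of FP32
)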
fbgemm_gpu/quantize_utils.py CHANGED
@@ -14,9 +14,15 @@ import torch  # isort:skip
 
 import fbgemm_gpu
 
-from fbgemm_gpu.triton import dequantize_mx4, quantize_mx4, RoundingMode
+from fbgemm_gpu.split_embedding_configs import SparseType
+from fbgemm_gpu.triton.common import RoundingMode
 from fbgemm_gpu.triton.quantize_ref import py_dequantize_mx4, py_quantize_mx4
 
+if torch.cuda.is_available():
+    from fbgemm_gpu.triton import quantize_mx4
+    from fbgemm_gpu.triton.quantize import triton_dequantize_mx4
+
+
 try:
     # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
     open_source = bool(getattr(fbgemm_gpu, "open_source", False))
@@ -126,25 +132,71 @@ def mx4_to_fp32(
 ) -> torch.Tensor:
     """Dequantize an MX4 tensor to FP32 with triton or native cuda impl.
 
+    This function is kept for backward compatibility and always returns FP32.
+    For BF16 output, use mx4_to_float() with output_dtype=SparseType.BF16.
+    """
+    return mx4_to_float(
+        tensor,
+        group_size,
+        use_triton,
+        ebits,
+        mbits,
+        output_dtype=None,  # None = FP32 default for backward compatibility
+    )
+
+
+def mx4_to_float(
+    tensor: torch.Tensor,
+    group_size: int = 32,
+    use_triton: bool = True,
+    ebits: int = 2,
+    mbits: int = 1,
+    output_dtype: Optional[SparseType] = None,
+) -> torch.Tensor:
+    """Dequantize an MX4 tensor to FP32 or BF16 with triton or native cuda impl.
+
     Args:
         tensor (torch.Tensor): MX4 packed tensor with total elements (M / 2 + M / groupsize)
         group_size (int): Compute scale in chunks of group_size.
         use_triton (bool): If set, use triton quantization, otherwise cuda.
         ebits (int): Number of exponent bits in target mx4 format.
         mbits (int): Number of mantissa bits in target mx4 format.
+        output_dtype (Optional[SparseType]): Output dtype (FP32 or BF16).
+            Defaults to None (FP32) for backward compatibility.
 
     Return:
-        output: FP32 tensor with total elements (M).
+        output: Tensor with dtype matching output_dtype and total elements (M).
     """
+    # Validate output_dtype
+    supported_dtypes = {SparseType.FP32, SparseType.BF16}
+    if output_dtype is not None and output_dtype not in supported_dtypes:
+        raise ValueError(
+            f"output_dtype must be one of {supported_dtypes}, got {output_dtype}. "
+            f"FP16 is not supported due to potential overflow/underflow with MX4's wide exponent range. "
+            f"Use BF16 for memory savings with same dynamic range as FP32."
+        )
+
+    target_dtype = (
+        output_dtype.as_dtype() if output_dtype is not None else torch.float32
+    )
+
     # Accelerated MX4 dequantize is only available on cuda, if input is on cpu, use python.
     if not tensor.is_cuda and not tensor.is_mtia:
-        return py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+        result = py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+        return result.to(target_dtype) if output_dtype is not None else result
     if use_triton:
         if tensor.is_mtia:
-            return mtia_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
-        return dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+            return mtia_dequantize_mx4(
+                tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+            )
+        return triton_dequantize_mx4(
+            tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+        )
     else:
-        return torch.ops.fbgemm.dequantize_mx_cuda(tensor.flatten(), group_size)
+        output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
+        return torch.ops.fbgemm.dequantize_mx_cuda(
+            tensor.flatten(), group_size, output_dtype_int
+        )
 
 
 def fp32_to_fp16_with_clamp(tensor: torch.Tensor) -> torch.Tensor:
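The quantize_utils.py hunk above keeps mx4_to_fp32() as a thin FP32-only wrapper and introduces mx4_to_float() with an optional SparseType output dtype (FP32 or BF16; FP16 is rejected). A short hedged sketch of the new entry point, assuming fp32_to_mx4() keeps its existing (tensor, group_size) calling convention:

# Hedged usage sketch of mx4_to_float(); runs on a CUDA device.
import torch
from fbgemm_gpu.quantize_utils import fp32_to_mx4, mx4_to_float
from fbgemm_gpu.split_embedding_configs import SparseType

x = torch.randn(1024, device="cuda")          # M elements
packed = fp32_to_mx4(x, group_size=32)        # M / 2 + M / 32 packed elements
y_fp32 = mx4_to_float(packed, group_size=32)  # default output stays FP32
y_bf16 = mx4_to_float(packed, group_size=32, output_dtype=SparseType.BF16)
assert y_fp32.dtype == torch.float32 and y_bf16.dtype == torch.bfloat16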
fbgemm_gpu/triton/quantize.py CHANGED
@@ -575,7 +575,7 @@ def _kernel_dequantize_mx4(
         # Write final outputs.
         tl.store(
             out + output_offset,
-            scaled_fp32,
+            scaled_fp32.to(out.dtype.element_ty),
             # Mask values that are out of this chunk or the main array.
             mask=(output_offset < OUTPUT_SIZE)
             & (output_offset < OUTPUT_CHUNK_SIZE * (pid + 1)),
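The one-line kernel change above lets _kernel_dequantize_mx4 store into whichever dtype the output buffer was allocated with, by casting through out.dtype.element_ty at store time. A minimal, self-contained Triton sketch of the same pattern (an illustration, not the FBGEMM kernel):

# Hedged illustration of casting to the output pointer's element type in tl.store.
import torch
import triton
import triton.language as tl


@triton.jit
def _store_cast_kernel(src, out, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    vals = tl.load(src + offs, mask=mask).to(tl.float32)  # compute in fp32
    # Match the output buffer's dtype, so one kernel serves fp32 and bf16 outputs.
    tl.store(out + offs, vals.to(out.dtype.element_ty), mask=mask)


x = torch.randn(256, device="cuda")
y = torch.empty_like(x, dtype=torch.bfloat16)
_store_cast_kernel[(1,)](x, y, x.numel(), BLOCK=256)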
@@ -588,10 +588,14 @@
 
 
 def triton_dequantize_mx4(
-    a: torch.Tensor, group_size: int = 32, ebits: int = 2, mbits: int = 1
+    a: torch.Tensor,
+    group_size: int = 32,
+    ebits: int = 2,
+    mbits: int = 1,
+    output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """
-    Dequantize a tensor from mx4 format to fp32.
+    Dequantize a tensor from mx4 format to fp32 or bf16.
 
     Args:
         a (Tensor): [M / 2 + M / group_size] MX4 tensor packed into int8 values
@@ -599,13 +603,15 @@ def triton_dequantize_mx4(
         group_size (int): Size of chunks that use the same shared exponent.
         ebits (int): Number of bits to use for exponent in target mx4 format.
         mbits (int): Number of bits to use for mantissa in target mx4 format.
+        output_dtype (torch.dtype): Output dtype (FP32 or BF16).
+            Defaults to torch.float32 for backward compatibility.
 
     Returns:
-        torch.Tensor: [M, K] dequantized fp32 tensor.
+        torch.Tensor: [M, K] dequantized tensor in the specified dtype.
     """
     # If given an empty shape, return an empty tensor.
     if a.numel() == 0:
-        return torch.empty(a.shape, device=a.device, dtype=torch.float32)
+        return torch.empty(a.shape, device=a.device, dtype=output_dtype)
     # View a as 2D for simplicity.
     orig_shape = a.shape
     a = a.flatten()
@@ -622,9 +628,9 @@
     # Use a lookup table to convert
    mx4_to_fp_values = get_mx4_lookup_table(ebits, mbits, a.device)
 
-    # Create output tensor.
+    # Create output tensor in target dtype.
     output_elems = num_groups * group_size
-    out = torch.empty([output_elems], device=a.device, dtype=torch.float)
+    out = torch.empty([output_elems], device=a.device, dtype=output_dtype)
     # Check if we need to use int64 for indexing.
     use_int64 = num_threads * groups_per_thread * group_size > 2**31 - 1
     # Invoke triton dequantization kernel over rows.
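At this level the Triton wrapper takes a torch.dtype directly rather than a SparseType. A brief hedged call, assuming packed is an MX4 tensor on a CUDA device (e.g. produced by fp32_to_mx4 in the earlier sketch):

import torch
from fbgemm_gpu.triton.quantize import triton_dequantize_mx4

out = triton_dequantize_mx4(packed, group_size=32, output_dtype=torch.bfloat16)
# Omitting output_dtype keeps the previous behavior and returns torch.float32.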
fbgemm_gpu_genai_nightly-2026.1.9.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_genai_nightly
-Version: 2026.1.4
+Version: 2026.1.9
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
fbgemm_gpu_genai_nightly-2026.1.9.dist-info/RECORD CHANGED
@@ -2,12 +2,12 @@ fbgemm_gpu/__init__.py,sha256=bL2dL7uYeXb1GvdjIDUTcLXLRGNfmnI4MQoE3-Gg5m8,6361
 fbgemm_gpu/asmjit.so,sha256=UxnhHlu9LgmoRXa8fZwSX56b5QKffBxfAOs0AZLxRfk,501728
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=U864UANx-CVyFYk5ADawCd0uWRfntHaVcyl6AVty_3Q,5642616
+fbgemm_gpu/fbgemm.so,sha256=HQUXhk9ikCtui6125NmRjOJvAMaWRgZXLlBcQHsh2Xo,5646712
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
-fbgemm_gpu/quantize_comm.py,sha256=ZfXtRHfqpVpV6k2PDL6oTUkKYzopqAV2M6vavp_RLSM,12022
-fbgemm_gpu/quantize_utils.py,sha256=q8Aokk6nlHbXF6HcDBbhBCAGSZV4klM8uPF-MUFFtAw,8324
+fbgemm_gpu/quantize_comm.py,sha256=j4-wBqWRtXjhtQBKi7IOAftNDzv8-AeX9YXlD8e682c,11983
+fbgemm_gpu/quantize_utils.py,sha256=fK4Dk9Qpjsu4qASCwAxkLjbiFRLI71Hd-AtHY4NyMZ8,10200
 fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
 fbgemm_gpu/sparse_ops.py,sha256=_EJC1pAbNnAnVQQ5JBg4DAV2TboIj-4XQkiKMmg1vXI,50417
 fbgemm_gpu/split_embedding_configs.py,sha256=EuVFKIDrgRQpRC5mmB4Du6WftK5GXJvDue9_ezt_eBI,16575
@@ -32,9 +32,9 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.genai.json.py,sha256=5TMzQCJ6eRjDaUActAOucxjizI7IZg56rn512-ujiE4,77
+fbgemm_gpu/docs/target.genai.json.py,sha256=TVO8vYaBQPaEdT-bYeXlTdOGiTw4ceWeAWa9m9Wnerg,77
 fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=y0Z22D1LnOkH9vXtRVPYWJ5raZC27OTViPEtnqi8TyY,190656
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=9NH_0L5RRD5NwIOILKiOGjKo25isXYYkmwFvbIlUGe0,190656
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
 fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=R4VNZdPSgmRmwDfTt2CShED2SGUF6dCXSUW2C4LISgE,215713
@@ -43,7 +43,7 @@ fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=5ClZ-GDrx6q0uaqW
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=2iHWrQDzhysRNMPbjFQpsxNdAkIRq__vTHy75sa4kJo,65238760
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=_1HEs4E69AIPZ7zauPqcHB01egv5eNf2IKNZuihMOAA,65230568
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=-R_LxyHpdXMILU9TNuYoRisBCkfK0_VLyixefaeZf4g,1463
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=gbhNU3mDTKJb3yt3inIDbiUjX_SG1oZfzgDygtHvMpk,10101
@@ -111,7 +111,7 @@ fbgemm_gpu/tbe/utils/quantize.py,sha256=icN2MXnl5rNqtKhGKkjpelx5pYBMYUv-6CrghxeV
 fbgemm_gpu/tbe/utils/requests.py,sha256=rQkEoaUUWEYCQM-1K_Lxg1wPcyIVw8sbdaGFTpsaE5I,18040
 fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
 fbgemm_gpu/triton/common.py,sha256=wnkLd2a8fKpefymLL-LjNKEL4hDVSxFiF5g3aF8mzsw,2131
-fbgemm_gpu/triton/quantize.py,sha256=z3y74-DCbGcQDsO70b2jK_HQDIYC0UJ7IEG2vvMu0_Y,26816
+fbgemm_gpu/triton/quantize.py,sha256=bjMPgcUOcuG_d9I_EjSCpkU3Fr5f3FU7CIWIqzc_N3w,27074
 fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2MWS80,11553
 fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
 fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=F2eQWjkWMR5RWQ48oIr-8OU_CRZyLazDpT7DFrDWS6g,29871
@@ -122,7 +122,7 @@ fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg
 fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2026.1.4.dist-info/METADATA,sha256=MjhefCkWlccqGa-waygmSKkW1vaKWbpxX1U8VLRrMJ0,2655
-fbgemm_gpu_genai_nightly-2026.1.4.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
-fbgemm_gpu_genai_nightly-2026.1.4.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_genai_nightly-2026.1.4.dist-info/RECORD,,
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/METADATA,sha256=WQ7sQGvWWGQao3Wcjk39i_bFY2BvrmWfGHxHgWxEsug,2655
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/RECORD,,