fbgemm-gpu-genai-nightly 2025.10.5__cp312-cp312-manylinux_2_28_x86_64.whl → 2025.10.7__cp312-cp312-manylinux_2_28_x86_64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This release of fbgemm-gpu-genai-nightly has been flagged as potentially problematic.

fbgemm_gpu/asmjit.so CHANGED
Binary file
fbgemm_gpu/docs/version.py CHANGED
@@ -6,6 +6,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-__version__: str = "2025.10.5"
+__version__: str = "2025.10.7"
 __target__: str = "genai"
 __variant__: str = "cuda"
fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py CHANGED
@@ -1239,6 +1239,55 @@ def triton_rms_quantize_mx4_unpack(
     return out.view(list(orig_shape[:-1]) + [-1]), scale
 
 
+@triton.jit
+def _fp32_to_e8m0(
+    unscale,
+    mbits: tl.constexpr,
+    scale_round_mode: tl.constexpr,
+):
+    E8M0_EXPONENT_BIAS: tl.constexpr = 127  # type: ignore[Incompatible variable type]
+    sign = tl.where(unscale < 0, -1.0, 1.0)
+    abs_tensor = tl.abs(unscale)
+
+    # MBITS_F32 = 23
+    if scale_round_mode == "even":
+        val_to_add = (1 << (23 - mbits - 1)) - 1
+    elif scale_round_mode == "ceil":
+        val_to_add = (1 << 23) - 1
+    else:
+        val_to_add = 0
+
+    mask_exponent = ((1 << (8 + 1)) - 1) << 23
+    mask_mantissa = (1 << 23) - 1
+
+    fp32_bits = tl.extra.cuda.libdevice.float_as_int(abs_tensor)
+    fp32_bits_exp = (fp32_bits + val_to_add) & mask_exponent
+    exponent = (fp32_bits_exp >> 23) & 0xFF
+
+    if scale_round_mode == "nv_round":
+        mantissa = fp32_bits & mask_mantissa
+        is_denormal = (exponent == 0) & (mantissa != 0)
+        is_normal = ~is_denormal
+        condition1 = is_normal & (exponent < 254) & (mantissa > 0)
+        condition2 = is_denormal & (mantissa / (2**23) > 0.5)
+
+        exponent = tl.where(condition1 | condition2, exponent + 1, exponent)
+
+    exponent = exponent.to(tl.float32)
+    e8m0_values = sign * tl.exp2(exponent - E8M0_EXPONENT_BIAS)
+
+    unscale = e8m0_values
+    # In case unscale=0 (scale will be inf), or unscale=inf or nan, we set the scale to 1.0
+    unscale_invalid_mask = (
+        (e8m0_values == 0)
+        | (e8m0_values == float("inf"))
+        | (e8m0_values == float("nan"))
+    )
+    unscale = tl.where(unscale_invalid_mask, 1.0, unscale)
+
+    return unscale
+
+
 @triton.jit
 def _kernel_nvfp4_quantize(
     A,
@@ -1261,6 +1310,7 @@ def _kernel_nvfp4_quantize(
     GROUP_LOAD: tl.constexpr,
     USE_INT64: tl.constexpr,
     SCALE_K: tl.constexpr,
+    USE_E8M0_SCALE: tl.constexpr,
 ) -> None:
     """Quantize a 1D float tensor into a packed MX4 tensor.
 
@@ -1282,6 +1332,8 @@ def _kernel_nvfp4_quantize(
         FP4_EXP_BIAS (int): Exponent bias of target mx4 format.
         GROUP_LOAD (int): Number of groups to process simultaneously.
         USE_INT64 (bool): Whether to use int64 for indexing. This is needed for large tensors.
+        USE_E8M0_SCALE (bool): Whether to use E8M0 for quantization
+            (set to True when we want to mimic mx4's e8m0 scaling factor in nvfp4's fp8 local scale)
     """
     # Define Constant Expressions.
     BF16_MIN_NORMAL: tl.constexpr = 2 ** (-126)  # type: ignore[Incompatible variable type]
@@ -1347,7 +1399,12 @@ def _kernel_nvfp4_quantize(
     group_max = tl.max(tl.abs(a_groups), axis=1).to(tl.float32)
 
     # Next we scale A in preparation for quantization.
-    scale_ = (group_max / 6.0 * input_global_scale).to(tl.float8e4nv)
+    if USE_E8M0_SCALE:
+        scale_fp32 = group_max / 4.0 * input_global_scale
+        scale_fp32 = _fp32_to_e8m0(scale_fp32, mbits=1, scale_round_mode="even")
+    else:
+        scale_fp32 = group_max / 6.0 * input_global_scale
+    scale_ = scale_fp32.to(tl.float8e4nv)
     # Prevent infinite values in log.
     group_max = tl.where(group_max == 0, BF16_MIN_NORMAL, group_max)
 
@@ -1447,6 +1504,7 @@ def triton_scale_nvfp4_quant(
     rounding_mode: Union[RoundingMode, int] = RoundingMode.ceil,
     stochastic_casting: bool = False,
     EPS: float = 1e-5,
+    use_e8m0_scale: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize a tensor to nvfp4 format using efficient triton kernels.
@@ -1459,7 +1517,8 @@ def triton_scale_nvfp4_quant(
         rounding_mode (Union[RoundingMode, int]): Which type of rounding to use
             when calculating shared exponent. Defaults to pre-rounding to nearest even int.
         stochastic_casting (bool): Whether to use stochastic casting.
-
+        use_e8m0_scale (bool): Whether to use E8M0 for quantization
+            (set to True when we want to mimic mx4's e8m0 scaling factor in nvfp4's fp8 local scale)
     Returns:
         torch.Tensor: [M / 2] nvfp4 scaled tensor packed into int8
         torch.Tensor: [M / group_size] nvfp4 shared exponents into int8
@@ -1567,6 +1626,8 @@ def triton_scale_nvfp4_quant(
         USE_INT64=use_int64,
         # pyre-ignore[6]
         SCALE_K=rounded_K,
+        # pyre-ignore[6]
+        USE_E8M0_SCALE=use_e8m0_scale,
     )
 
     scale = scale.flatten()
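At the Python level, the user-visible change in fp4_quantize.py is the new use_e8m0_scale keyword on triton_scale_nvfp4_quant. A minimal usage sketch follows, assuming the import path listed in the RECORD below and that the first two positional arguments are the bf16 input and the per-tensor global scale; only the use_e8m0_scale keyword itself is confirmed by this diff.

    import torch
    from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
        triton_scale_nvfp4_quant,
    )

    x = torch.randn(128, 256, dtype=torch.bfloat16, device="cuda")
    # Per-tensor global scale; its shape and derivation are not shown in this diff.
    global_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")

    # Existing behaviour: per-group fp8 scales computed from group_max / 6.0.
    packed, scales = triton_scale_nvfp4_quant(x, global_scale)

    # New in 2025.10.7: power-of-two (e8m0-style) per-group scales,
    # computed from group_max / 4.0 and rounded via _fp32_to_e8m0.
    packed_e8m0, scales_e8m0 = triton_scale_nvfp4_quant(
        x, global_scale, use_e8m0_scale=True
    )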
fbgemm_gpu/fbgemm.so CHANGED
Binary file
fbgemm_gpu/sparse_ops.py CHANGED
@@ -49,7 +49,7 @@ except Exception:
 
 import torch.utils._pytree as pytree
 from torch import SymInt, Tensor
-from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
+from torch.fx.experimental.symbolic_shapes import guard_or_true
 
 
 if hasattr(torch.library, "register_fake"):
@@ -251,7 +251,7 @@ def tbe_input_combine_abstract(
         torch._check(index.is_contiguous())
         torch._check(offset.is_contiguous())
         total_indices = total_indices + index.numel()
-        if guard_size_oblivious(weight.numel() > 0):
+        if guard_or_true(weight.numel() > 0):
            torch._check(weight.dim() == 1)
            torch._check(weight.numel() == index.numel())
            torch._check(weight.is_contiguous())
@@ -288,7 +288,7 @@ def tbe_input_combine_with_length_abstract(
         torch._check(offset.is_contiguous())
         total_indices = total_indices + index.numel()
         total_offsets = total_offsets + offset.numel()
-        if guard_size_oblivious(weight.numel() > 0):
+        if guard_or_true(weight.numel() > 0):
            torch._check(weight.dim() == 1)
            torch._check(weight.numel() == index.numel())
            torch._check(weight.is_contiguous())
@@ -807,7 +807,7 @@ def batch_index_select_dim0_forward_cpu_impl_abstract(
     torch._check(num_inputs == len(input_rows))
     torch._check(num_inputs == len(input_columns))
 
-    if permute_output_dim_0_1 and guard_size_oblivious(len(input_num_indices) > 0):
+    if permute_output_dim_0_1 and guard_or_true(len(input_num_indices) > 0):
         # All num_indices must be the same if permute_output_dim_0_1 is True
         for x in input_num_indices:
             torch._check(x == input_num_indices[0])
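The sparse_ops.py change replaces guard_size_oblivious with guard_or_true in three fake/abstract implementations. guard_or_true evaluates the comparison when it can and otherwise falls back to True rather than raising a data-dependent guard error, so the shape checks still run when a size is an unbacked SymInt under torch.compile. A hypothetical standalone sketch of the same pattern (the fake op below is illustrative and not taken from the wheel):

    import torch
    from torch.fx.experimental.symbolic_shapes import guard_or_true

    def _my_fake_op(weight: torch.Tensor, index: torch.Tensor) -> None:
        # With an unbacked symbolic numel the comparison may be undecidable;
        # guard_or_true then returns True so the checks below still execute
        # instead of aborting fake-tensor tracing.
        if guard_or_true(weight.numel() > 0):
            torch._check(weight.dim() == 1)
            torch._check(weight.numel() == index.numel())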
fbgemm_gpu_genai_nightly-2025.10.5.dist-info/METADATA → fbgemm_gpu_genai_nightly-2025.10.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_genai_nightly
-Version: 2025.10.5
+Version: 2025.10.7
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
fbgemm_gpu_genai_nightly-2025.10.5.dist-info/RECORD → fbgemm_gpu_genai_nightly-2025.10.7.dist-info/RECORD CHANGED
@@ -1,15 +1,15 @@
 fbgemm_gpu/__init__.py,sha256=FdQCmpvETH80tlIPP6W8MrOmzLaX9eoGY-fuHtVPbj0,5747
-fbgemm_gpu/asmjit.so,sha256=tp-5cN7HUYo7cjvR_kl_vfPBSEv78-IQxdvHN-nXFAM,501728
+fbgemm_gpu/asmjit.so,sha256=yDq47YobRro7Tvd4IaPNyQUf1YaA8iLyfcwnUdh0Coo,484232
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=OAAQh-pMK3NUK0QFpDpsWWAGWMNr4tOrS1zN_N6paSU,5642616
+fbgemm_gpu/fbgemm.so,sha256=E4-lI4QpwkjkPmH3u1IKBUjBEDrdbL6YgeFnhIt5YKo,5811328
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
 fbgemm_gpu/quantize_comm.py,sha256=NqjKcQkieCrWH2HvxF8oTfzlgMA6sK9rHEUrSuCn5w4,11492
 fbgemm_gpu/quantize_utils.py,sha256=q8Aokk6nlHbXF6HcDBbhBCAGSZV4klM8uPF-MUFFtAw,8324
 fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
-fbgemm_gpu/sparse_ops.py,sha256=kEwe7Mev6o4RXYwZK9a0ksPgJJSSPvCkNbFwl_MTl_s,48476
+fbgemm_gpu/sparse_ops.py,sha256=VYm_3f-Z-59b3gPS2aykbNI-d_HXAIvlPjtU-EL9tlY,48448
 fbgemm_gpu/split_embedding_configs.py,sha256=fv29efZGD_cvh5KwdvTFD6GZtqJLYjWXW_0vMeyT_6k,15483
 fbgemm_gpu/split_embedding_inference_converter.py,sha256=AghGW22MgMsdHzdwdPMPYDjgas5AE_estckY8rMgXVU,7056
 fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
@@ -32,18 +32,18 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/version.py,sha256=axMqnt_uxHLVuIT4M2QVOCEQgEYGPpnbD2G5jg9tAXA,316
+fbgemm_gpu/docs/version.py,sha256=l0fTZZUWsGJgrEdtJbnCWLlUVNlJ1cmhFuAR4Maj8Sg,316
 fbgemm_gpu/experimental/example/__init__.py,sha256=V_XrGMq2oNVMpzwe1srlaTaHeIcZJw5oAGbo3seM_Ks,870
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=-IkuW8ZgEVlnqdY4NOqmY-3WmdwxrhcNPjDAWcQnLmw,243904
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=K70etfWeSleFOhfxXvSmpZMYBn_xmpvSxgdcGenvaKo,232488
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=AqHefiOaN_SjP5ew7RYGuKFuSlhedOJL_6f97TtLv7c,566
-fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=qJmQOBa9iW-HhRYm8lzE36Lz7vpBevCS6pWQyy33pag,213404
+fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=2RjIDSzUXtoFoC2ryp-C-j5H83mbSjPwvsvTrThfrqE,215658
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=qwfuF5E5K4oDiH7RJkpC7zth3kAsG7wv_glCl2A_G2A,1860
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=Nuoq1hmi6Khjn9feBvjWA23JYN0a8DltMGBcWiXiML0,78620888
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=7wbyCShChe1DVPKlqLliGPlpqo8U5AScgXWLllN9ZWY,77952696
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=oExepXpjMOwM43gARZARY0UtR-EX2zqRnSrOaQPy448,1044
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/METADATA,sha256=rTogIn95pgowlTBehwDMPRA5MmXP09AbRW_k4y12u84,2655
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/RECORD,,
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/METADATA,sha256=EpIY3ocq310OVN4Ma3kvReWnhF0OBb0syWTn5dY2S7M,2655
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/RECORD,,