fbgemm-gpu-genai-nightly 2025.10.5__cp312-cp312-manylinux_2_28_x86_64.whl → 2025.10.7__cp312-cp312-manylinux_2_28_x86_64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This release of fbgemm-gpu-genai-nightly has been flagged as potentially problematic.

fbgemm_gpu/asmjit.so CHANGED
Binary file
fbgemm_gpu/docs/version.py CHANGED
@@ -6,6 +6,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-__version__: str = "2025.10.5"
+__version__: str = "2025.10.7"
 __target__: str = "genai"
 __variant__: str = "cuda"
fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py CHANGED
@@ -1239,6 +1239,55 @@ def triton_rms_quantize_mx4_unpack(
     return out.view(list(orig_shape[:-1]) + [-1]), scale
 
 
+@triton.jit
+def _fp32_to_e8m0(
+    unscale,
+    mbits: tl.constexpr,
+    scale_round_mode: tl.constexpr,
+):
+    E8M0_EXPONENT_BIAS: tl.constexpr = 127  # type: ignore[Incompatible variable type]
+    sign = tl.where(unscale < 0, -1.0, 1.0)
+    abs_tensor = tl.abs(unscale)
+
+    # MBITS_F32 = 23
+    if scale_round_mode == "even":
+        val_to_add = (1 << (23 - mbits - 1)) - 1
+    elif scale_round_mode == "ceil":
+        val_to_add = (1 << 23) - 1
+    else:
+        val_to_add = 0
+
+    mask_exponent = ((1 << (8 + 1)) - 1) << 23
+    mask_mantissa = (1 << 23) - 1
+
+    fp32_bits = tl.extra.cuda.libdevice.float_as_int(abs_tensor)
+    fp32_bits_exp = (fp32_bits + val_to_add) & mask_exponent
+    exponent = (fp32_bits_exp >> 23) & 0xFF
+
+    if scale_round_mode == "nv_round":
+        mantissa = fp32_bits & mask_mantissa
+        is_denormal = (exponent == 0) & (mantissa != 0)
+        is_normal = ~is_denormal
+        condition1 = is_normal & (exponent < 254) & (mantissa > 0)
+        condition2 = is_denormal & (mantissa / (2**23) > 0.5)
+
+        exponent = tl.where(condition1 | condition2, exponent + 1, exponent)
+
+    exponent = exponent.to(tl.float32)
+    e8m0_values = sign * tl.exp2(exponent - E8M0_EXPONENT_BIAS)
+
+    unscale = e8m0_values
+    # In case unscale=0 (scale will be inf), or unscale=inf or nan, we set the scale to 1.0
+    unscale_invalid_mask = (
+        (e8m0_values == 0)
+        | (e8m0_values == float("inf"))
+        | (e8m0_values == float("nan"))
+    )
+    unscale = tl.where(unscale_invalid_mask, 1.0, unscale)
+
+    return unscale
+
+
 @triton.jit
 def _kernel_nvfp4_quantize(
     A,
@@ -1261,6 +1310,7 @@ def _kernel_nvfp4_quantize(
     GROUP_LOAD: tl.constexpr,
     USE_INT64: tl.constexpr,
     SCALE_K: tl.constexpr,
+    USE_E8M0_SCALE: tl.constexpr,
 ) -> None:
     """Quantize a 1D float tensor into a packed MX4 tensor.
 
@@ -1282,6 +1332,8 @@ def _kernel_nvfp4_quantize(
         FP4_EXP_BIAS (int): Exponent bias of target mx4 format.
         GROUP_LOAD (int): Number of groups to process simultaneously.
         USE_INT64 (bool): Whether to use int64 for indexing. This is needed for large tensors.
+        USE_E8M0_SCALE (bool): Whether to use E8M0 for quantization
+            (set to True when we want to mimic mx4's e8m0 scaling factor in nvfp4's fp8 local scale)
     """
     # Define Constant Expressions.
     BF16_MIN_NORMAL: tl.constexpr = 2 ** (-126)  # type: ignore[Incompatible variable type]
@@ -1347,7 +1399,12 @@ def _kernel_nvfp4_quantize(
     group_max = tl.max(tl.abs(a_groups), axis=1).to(tl.float32)
 
     # Next we scale A in preparation for quantization.
-    scale_ = (group_max / 6.0 * input_global_scale).to(tl.float8e4nv)
+    if USE_E8M0_SCALE:
+        scale_fp32 = group_max / 4.0 * input_global_scale
+        scale_fp32 = _fp32_to_e8m0(scale_fp32, mbits=1, scale_round_mode="even")
+    else:
+        scale_fp32 = group_max / 6.0 * input_global_scale
+    scale_ = scale_fp32.to(tl.float8e4nv)
     # Prevent infinite values in log.
     group_max = tl.where(group_max == 0, BF16_MIN_NORMAL, group_max)
 
@@ -1447,6 +1504,7 @@ def triton_scale_nvfp4_quant(
     rounding_mode: Union[RoundingMode, int] = RoundingMode.ceil,
     stochastic_casting: bool = False,
     EPS: float = 1e-5,
+    use_e8m0_scale: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize a tensor to nvfp4 format using efficient triton kernels.
@@ -1459,7 +1517,8 @@ def triton_scale_nvfp4_quant(
         rounding_mode (Union[RoundingMode, int]): Which type of rounding to use
             when calculating shared exponent. Defaults to pre-rounding to nearest even int.
         stochastic_casting (bool): Whether to use stochastic casting.
-
+        use_e8m0_scale (bool): Whether to use E8M0 for quantization
+            (set to True when we want to mimic mx4's e8m0 scaling factor in nvfp4's fp8 local scale)
     Returns:
         torch.Tensor: [M / 2] nvfp4 scaled tensor packed into int8
         torch.Tensor: [M / group_size] nvfp4 shared exponents into int8
@@ -1567,6 +1626,8 @@ def triton_scale_nvfp4_quant(
         USE_INT64=use_int64,
         # pyre-ignore[6]
         SCALE_K=rounded_K,
+        # pyre-ignore[6]
+        USE_E8M0_SCALE=use_e8m0_scale,
     )
 
     scale = scale.flatten()
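At the Python level, the user-visible change in fp4_quantize.py is the new use_e8m0_scale keyword on triton_scale_nvfp4_quant. A minimal usage sketch follows, assuming the import path listed in the RECORD below and that the first two positional arguments are the bf16 input and the per-tensor global scale; only the use_e8m0_scale keyword itself is confirmed by this diff.

    import torch
    from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
        triton_scale_nvfp4_quant,
    )

    x = torch.randn(128, 256, dtype=torch.bfloat16, device="cuda")
    # Per-tensor global scale; its shape and derivation are not shown in this diff.
    global_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")

    # Existing behaviour: per-group fp8 scales computed from group_max / 6.0.
    packed, scales = triton_scale_nvfp4_quant(x, global_scale)

    # New in 2025.10.7: power-of-two (e8m0-style) per-group scales,
    # computed from group_max / 4.0 and rounded via _fp32_to_e8m0.
    packed_e8m0, scales_e8m0 = triton_scale_nvfp4_quant(
        x, global_scale, use_e8m0_scale=True
    )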
fbgemm_gpu/fbgemm.so CHANGED
Binary file
fbgemm_gpu/sparse_ops.py CHANGED
@@ -49,7 +49,7 @@ except Exception:
 
 import torch.utils._pytree as pytree
 from torch import SymInt, Tensor
-from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
+from torch.fx.experimental.symbolic_shapes import guard_or_true
 
 
 if hasattr(torch.library, "register_fake"):
@@ -251,7 +251,7 @@ def tbe_input_combine_abstract(
         torch._check(index.is_contiguous())
         torch._check(offset.is_contiguous())
         total_indices = total_indices + index.numel()
-        if guard_size_oblivious(weight.numel() > 0):
+        if guard_or_true(weight.numel() > 0):
            torch._check(weight.dim() == 1)
            torch._check(weight.numel() == index.numel())
            torch._check(weight.is_contiguous())
@@ -288,7 +288,7 @@ def tbe_input_combine_with_length_abstract(
         torch._check(offset.is_contiguous())
         total_indices = total_indices + index.numel()
         total_offsets = total_offsets + offset.numel()
-        if guard_size_oblivious(weight.numel() > 0):
+        if guard_or_true(weight.numel() > 0):
            torch._check(weight.dim() == 1)
            torch._check(weight.numel() == index.numel())
            torch._check(weight.is_contiguous())
@@ -807,7 +807,7 @@ def batch_index_select_dim0_forward_cpu_impl_abstract(
     torch._check(num_inputs == len(input_rows))
     torch._check(num_inputs == len(input_columns))
 
-    if permute_output_dim_0_1 and guard_size_oblivious(len(input_num_indices) > 0):
+    if permute_output_dim_0_1 and guard_or_true(len(input_num_indices) > 0):
         # All num_indices must be the same if permute_output_dim_0_1 is True
         for x in input_num_indices:
             torch._check(x == input_num_indices[0])
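The sparse_ops.py change replaces guard_size_oblivious with guard_or_true in three fake/abstract implementations. guard_or_true evaluates the comparison when it can and otherwise falls back to True rather than raising a data-dependent guard error, so the shape checks still run when a size is an unbacked SymInt under torch.compile. A hypothetical standalone sketch of the same pattern (the fake op below is illustrative and not taken from the wheel):

    import torch
    from torch.fx.experimental.symbolic_shapes import guard_or_true

    def _my_fake_op(weight: torch.Tensor, index: torch.Tensor) -> None:
        # With an unbacked symbolic numel the comparison may be undecidable;
        # guard_or_true then returns True so the checks below still execute
        # instead of aborting fake-tensor tracing.
        if guard_or_true(weight.numel() > 0):
            torch._check(weight.dim() == 1)
            torch._check(weight.numel() == index.numel())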
fbgemm_gpu_genai_nightly-2025.10.5.dist-info/METADATA → fbgemm_gpu_genai_nightly-2025.10.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_genai_nightly
-Version: 2025.10.5
+Version: 2025.10.7
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
fbgemm_gpu_genai_nightly-2025.10.5.dist-info/RECORD → fbgemm_gpu_genai_nightly-2025.10.7.dist-info/RECORD CHANGED
@@ -1,15 +1,15 @@
 fbgemm_gpu/__init__.py,sha256=FdQCmpvETH80tlIPP6W8MrOmzLaX9eoGY-fuHtVPbj0,5747
-fbgemm_gpu/asmjit.so,sha256=tp-5cN7HUYo7cjvR_kl_vfPBSEv78-IQxdvHN-nXFAM,501728
+fbgemm_gpu/asmjit.so,sha256=yDq47YobRro7Tvd4IaPNyQUf1YaA8iLyfcwnUdh0Coo,484232
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=OAAQh-pMK3NUK0QFpDpsWWAGWMNr4tOrS1zN_N6paSU,5642616
+fbgemm_gpu/fbgemm.so,sha256=E4-lI4QpwkjkPmH3u1IKBUjBEDrdbL6YgeFnhIt5YKo,5811328
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
 fbgemm_gpu/quantize_comm.py,sha256=NqjKcQkieCrWH2HvxF8oTfzlgMA6sK9rHEUrSuCn5w4,11492
 fbgemm_gpu/quantize_utils.py,sha256=q8Aokk6nlHbXF6HcDBbhBCAGSZV4klM8uPF-MUFFtAw,8324
 fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
-fbgemm_gpu/sparse_ops.py,sha256=kEwe7Mev6o4RXYwZK9a0ksPgJJSSPvCkNbFwl_MTl_s,48476
+fbgemm_gpu/sparse_ops.py,sha256=VYm_3f-Z-59b3gPS2aykbNI-d_HXAIvlPjtU-EL9tlY,48448
 fbgemm_gpu/split_embedding_configs.py,sha256=fv29efZGD_cvh5KwdvTFD6GZtqJLYjWXW_0vMeyT_6k,15483
 fbgemm_gpu/split_embedding_inference_converter.py,sha256=AghGW22MgMsdHzdwdPMPYDjgas5AE_estckY8rMgXVU,7056
 fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
@@ -32,18 +32,18 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/version.py,sha256=axMqnt_uxHLVuIT4M2QVOCEQgEYGPpnbD2G5jg9tAXA,316
+fbgemm_gpu/docs/version.py,sha256=l0fTZZUWsGJgrEdtJbnCWLlUVNlJ1cmhFuAR4Maj8Sg,316
 fbgemm_gpu/experimental/example/__init__.py,sha256=V_XrGMq2oNVMpzwe1srlaTaHeIcZJw5oAGbo3seM_Ks,870
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=-IkuW8ZgEVlnqdY4NOqmY-3WmdwxrhcNPjDAWcQnLmw,243904
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=K70etfWeSleFOhfxXvSmpZMYBn_xmpvSxgdcGenvaKo,232488
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=AqHefiOaN_SjP5ew7RYGuKFuSlhedOJL_6f97TtLv7c,566
-fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=qJmQOBa9iW-HhRYm8lzE36Lz7vpBevCS6pWQyy33pag,213404
+fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=2RjIDSzUXtoFoC2ryp-C-j5H83mbSjPwvsvTrThfrqE,215658
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=qwfuF5E5K4oDiH7RJkpC7zth3kAsG7wv_glCl2A_G2A,1860
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=Nuoq1hmi6Khjn9feBvjWA23JYN0a8DltMGBcWiXiML0,78620888
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=7wbyCShChe1DVPKlqLliGPlpqo8U5AScgXWLllN9ZWY,77952696
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=oExepXpjMOwM43gARZARY0UtR-EX2zqRnSrOaQPy448,1044
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/METADATA,sha256=rTogIn95pgowlTBehwDMPRA5MmXP09AbRW_k4y12u84,2655
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_genai_nightly-2025.10.5.dist-info/RECORD,,
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/METADATA,sha256=EpIY3ocq310OVN4Ma3kvReWnhF0OBb0syWTn5dY2S7M,2655
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/RECORD,,