fbgemm-gpu-genai-nightly 2025.10.5__cp312-cp312-manylinux_2_28_x86_64.whl → 2025.10.7__cp312-cp312-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fbgemm-gpu-genai-nightly might be problematic.
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/docs/version.py +1 -1
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +63 -2
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/sparse_ops.py +4 -4
- {fbgemm_gpu_genai_nightly-2025.10.5.dist-info → fbgemm_gpu_genai_nightly-2025.10.7.dist-info}/METADATA +1 -1
- {fbgemm_gpu_genai_nightly-2025.10.5.dist-info → fbgemm_gpu_genai_nightly-2025.10.7.dist-info}/RECORD +11 -11
- {fbgemm_gpu_genai_nightly-2025.10.5.dist-info → fbgemm_gpu_genai_nightly-2025.10.7.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.5.dist-info → fbgemm_gpu_genai_nightly-2025.10.7.dist-info}/top_level.txt +0 -0
fbgemm_gpu/asmjit.so
CHANGED
Binary file

fbgemm_gpu/docs/version.py
CHANGED

fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so
CHANGED
Binary file

fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py
CHANGED
@@ -1239,6 +1239,55 @@ def triton_rms_quantize_mx4_unpack(
     return out.view(list(orig_shape[:-1]) + [-1]), scale


+@triton.jit
+def _fp32_to_e8m0(
+    unscale,
+    mbits: tl.constexpr,
+    scale_round_mode: tl.constexpr,
+):
+    E8M0_EXPONENT_BIAS: tl.constexpr = 127  # type: ignore[Incompatible variable type]
+    sign = tl.where(unscale < 0, -1.0, 1.0)
+    abs_tensor = tl.abs(unscale)
+
+    # MBITS_F32 = 23
+    if scale_round_mode == "even":
+        val_to_add = (1 << (23 - mbits - 1)) - 1
+    elif scale_round_mode == "ceil":
+        val_to_add = (1 << 23) - 1
+    else:
+        val_to_add = 0
+
+    mask_exponent = ((1 << (8 + 1)) - 1) << 23
+    mask_mantissa = (1 << 23) - 1
+
+    fp32_bits = tl.extra.cuda.libdevice.float_as_int(abs_tensor)
+    fp32_bits_exp = (fp32_bits + val_to_add) & mask_exponent
+    exponent = (fp32_bits_exp >> 23) & 0xFF
+
+    if scale_round_mode == "nv_round":
+        mantissa = fp32_bits & mask_mantissa
+        is_denormal = (exponent == 0) & (mantissa != 0)
+        is_normal = ~is_denormal
+        condition1 = is_normal & (exponent < 254) & (mantissa > 0)
+        condition2 = is_denormal & (mantissa / (2**23) > 0.5)
+
+        exponent = tl.where(condition1 | condition2, exponent + 1, exponent)
+
+    exponent = exponent.to(tl.float32)
+    e8m0_values = sign * tl.exp2(exponent - E8M0_EXPONENT_BIAS)
+
+    unscale = e8m0_values
+    # In case unscale=0 (scale will be inf), or unscale=inf or nan, we set the scale to 1.0
+    unscale_invalid_mask = (
+        (e8m0_values == 0)
+        | (e8m0_values == float("inf"))
+        | (e8m0_values == float("nan"))
+    )
+    unscale = tl.where(unscale_invalid_mask, 1.0, unscale)
+
+    return unscale
+
+
 @triton.jit
 def _kernel_nvfp4_quantize(
     A,
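
The helper added above rounds an fp32 value to a power of two (E8M0 keeps 8 exponent bits and no mantissa): it adds a rounding bias to the raw fp32 bit pattern, discards the mantissa, and returns sign * 2**(exponent - 127), falling back to 1.0 for degenerate scales. For intuition, a minimal eager-mode PyTorch sketch of its "even" path follows (illustrative only, not part of the package; the function name is made up):

import torch

def fp32_to_e8m0_even_reference(x: torch.Tensor, mbits: int = 1) -> torch.Tensor:
    """Round |x| to a power of two, mirroring the "even" branch of _fp32_to_e8m0."""
    x = x.float()
    sign = torch.where(x < 0, -torch.ones_like(x), torch.ones_like(x))
    bits = x.abs().view(torch.int32)                 # reinterpret the fp32 bit pattern as int32
    val_to_add = (1 << (23 - mbits - 1)) - 1         # rounding bias added below the kept mantissa bits
    exponent = ((bits + val_to_add) >> 23) & 0xFF    # biased exponent after any carry; mantissa dropped
    out = sign * torch.exp2(exponent.to(torch.float32) - 127)   # 127 = E8M0_EXPONENT_BIAS
    invalid = (out == 0) | torch.isinf(out) | torch.isnan(out)
    return torch.where(invalid, torch.ones_like(out), out)      # degenerate scales fall back to 1.0

print(fp32_to_e8m0_even_reference(torch.tensor([0.3, 0.9, 5.5])))  # tensor([0.2500, 1.0000, 4.0000])
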
@@ -1261,6 +1310,7 @@ def _kernel_nvfp4_quantize(
     GROUP_LOAD: tl.constexpr,
     USE_INT64: tl.constexpr,
     SCALE_K: tl.constexpr,
+    USE_E8M0_SCALE: tl.constexpr,
 ) -> None:
     """Quantize a 1D float tensor into a packed MX4 tensor.

@@ -1282,6 +1332,8 @@ def _kernel_nvfp4_quantize(
         FP4_EXP_BIAS (int): Exponent bias of target mx4 format.
         GROUP_LOAD (int): Number of groups to process simultaneously.
         USE_INT64 (bool): Whether to use int64 for indexing. This is needed for large tensors.
+        USE_E8M0_SCALE (bool): Whether to use E8M0 for quantization
+            (set to True when we want to mimic mx4's e8m0 scaling factor in nvfp4's fp8 local scale)
     """
     # Define Constant Expressions.
     BF16_MIN_NORMAL: tl.constexpr = 2 ** (-126)  # type: ignore[Incompatible variable type]
@@ -1347,7 +1399,12 @@ def _kernel_nvfp4_quantize(
         group_max = tl.max(tl.abs(a_groups), axis=1).to(tl.float32)

         # Next we scale A in preparation for quantization.
-
+        if USE_E8M0_SCALE:
+            scale_fp32 = group_max / 4.0 * input_global_scale
+            scale_fp32 = _fp32_to_e8m0(scale_fp32, mbits=1, scale_round_mode="even")
+        else:
+            scale_fp32 = group_max / 6.0 * input_global_scale
+        scale_ = scale_fp32.to(tl.float8e4nv)
         # Prevent infinite values in log.
         group_max = tl.where(group_max == 0, BF16_MIN_NORMAL, group_max)

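
Concretely, the existing branch derives an fp8 (e4m3) local scale from group_max / 6.0 (6.0 being the largest e2m1 magnitude), while the new USE_E8M0_SCALE branch divides by 4.0 and snaps the result to a power of two before the same fp8 cast. A small plain-Python illustration with assumed values (not FBGEMM code):

import math
import struct

def e8m0_even(x: float, mbits: int = 1) -> float:
    # Same bit trick as _fp32_to_e8m0's "even" mode, written out in plain Python.
    bits = struct.unpack("<I", struct.pack("<f", abs(x)))[0]
    exponent = ((bits + (1 << (23 - mbits - 1)) - 1) >> 23) & 0xFF
    return math.copysign(2.0 ** (exponent - 127), x)

group_max = 5.5              # assumed per-group absolute max
input_global_scale = 1.0     # assumed global scale

scale_default = group_max / 6.0 * input_global_scale          # ~0.917, cast to fp8-e4m3 downstream
scale_e8m0 = e8m0_even(group_max / 4.0 * input_global_scale)  # 1.375 rounds to 1.0, an exact power of two
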
@@ -1447,6 +1504,7 @@ def triton_scale_nvfp4_quant(
     rounding_mode: Union[RoundingMode, int] = RoundingMode.ceil,
     stochastic_casting: bool = False,
     EPS: float = 1e-5,
+    use_e8m0_scale: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize a tensor to nvfp4 format using efficient triton kernels.
@@ -1459,7 +1517,8 @@ def triton_scale_nvfp4_quant(
         rounding_mode (Union[RoundingMode, int]): Which type of rounding to use
             when calculating shared exponent. Defaults to pre-rounding to nearest even int.
         stochastic_casting (bool): Whether to use stochastic casting.
-
+        use_e8m0_scale (bool): Whether to use E8M0 for quantization
+            (set to True when we want to mimic mx4's e8m0 scaling factor in nvfp4's fp8 local scale)
     Returns:
         torch.Tensor: [M / 2] nvfp4 scaled tensor packed into int8
         torch.Tensor: [M / group_size] nvfp4 shared exponents into int8
@@ -1567,6 +1626,8 @@ def triton_scale_nvfp4_quant(
         USE_INT64=use_int64,
         # pyre-ignore[6]
         SCALE_K=rounded_K,
+        # pyre-ignore[6]
+        USE_E8M0_SCALE=use_e8m0_scale,
     )

     scale = scale.flatten()
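
Taken together, the fp4_quantize.py hunks add an opt-in keyword to triton_scale_nvfp4_quant. A usage sketch follows; only the new use_e8m0_scale keyword and the two-tensor return are confirmed by this diff, while the leading arguments (the tensor to quantize and a per-tensor global scale, suggested by input_global_scale in the kernel), the bf16 dtype, and the CUDA device are assumptions:

import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import triton_scale_nvfp4_quant

x = torch.randn(128, 256, dtype=torch.bfloat16, device="cuda")  # assumed input layout and dtype
global_scale = torch.tensor(1.0, device="cuda")                 # assumed per-tensor global scale argument

# Existing behavior: per-group fp8 (e4m3) scales derived from group_max / 6.0.
packed, scales = triton_scale_nvfp4_quant(x, global_scale)

# New in 2025.10.7: power-of-two local scales (group_max / 4.0 rounded by _fp32_to_e8m0),
# still stored through the fp8 scale tensor.
packed_e8m0, scales_e8m0 = triton_scale_nvfp4_quant(x, global_scale, use_e8m0_scale=True)
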

fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so
CHANGED
Binary file
fbgemm_gpu/fbgemm.so
CHANGED
Binary file

fbgemm_gpu/sparse_ops.py
CHANGED
@@ -49,7 +49,7 @@ except Exception:

 import torch.utils._pytree as pytree
 from torch import SymInt, Tensor
-from torch.fx.experimental.symbolic_shapes import
+from torch.fx.experimental.symbolic_shapes import guard_or_true


 if hasattr(torch.library, "register_fake"):
@@ -251,7 +251,7 @@ def tbe_input_combine_abstract(
         torch._check(index.is_contiguous())
         torch._check(offset.is_contiguous())
         total_indices = total_indices + index.numel()
-        if
+        if guard_or_true(weight.numel() > 0):
             torch._check(weight.dim() == 1)
             torch._check(weight.numel() == index.numel())
             torch._check(weight.is_contiguous())
@@ -288,7 +288,7 @@ def tbe_input_combine_with_length_abstract(
         torch._check(offset.is_contiguous())
         total_indices = total_indices + index.numel()
         total_offsets = total_offsets + offset.numel()
-        if
+        if guard_or_true(weight.numel() > 0):
             torch._check(weight.dim() == 1)
             torch._check(weight.numel() == index.numel())
             torch._check(weight.is_contiguous())
@@ -807,7 +807,7 @@ def batch_index_select_dim0_forward_cpu_impl_abstract(
     torch._check(num_inputs == len(input_rows))
     torch._check(num_inputs == len(input_columns))

-    if permute_output_dim_0_1 and
+    if permute_output_dim_0_1 and guard_or_true(len(input_num_indices) > 0):
         # All num_indices must be the same if permute_output_dim_0_1 is True
         for x in input_num_indices:
             torch._check(x == input_num_indices[0])
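
The sparse_ops.py hunks route the data-dependent conditions in these abstract (fake) implementations through guard_or_true from torch.fx.experimental.symbolic_shapes. guard_or_true evaluates the condition normally when it is statically known and is designed to return True when it cannot be decided at trace time (for example with unbacked symbolic sizes under torch.compile or torch.export), rather than raising a data-dependent guard error. A small illustration with a hypothetical custom op (the op name, shapes, and registration are made up; only guard_or_true itself appears in the diff):

import torch
from torch.fx.experimental.symbolic_shapes import guard_or_true

@torch.library.custom_op("demo::maybe_weighted_sum", mutates_args=())
def maybe_weighted_sum(index: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Eager implementation: use the weights only when any were provided.
    return (index.float() * weight).sum() if weight.numel() > 0 else index.float().sum()

@maybe_weighted_sum.register_fake
def _(index, weight):
    # Under compile/export, weight.numel() > 0 may be undecidable; guard_or_true
    # optimistically takes the weighted path so the metadata checks still run.
    if guard_or_true(weight.numel() > 0):
        torch._check(weight.dim() == 1)
        torch._check(weight.numel() == index.numel())
    return torch.empty((), dtype=torch.float32, device=index.device)
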
{fbgemm_gpu_genai_nightly-2025.10.5.dist-info → fbgemm_gpu_genai_nightly-2025.10.7.dist-info}/RECORD
RENAMED
@@ -1,15 +1,15 @@
 fbgemm_gpu/__init__.py,sha256=FdQCmpvETH80tlIPP6W8MrOmzLaX9eoGY-fuHtVPbj0,5747
-fbgemm_gpu/asmjit.so,sha256=
+fbgemm_gpu/asmjit.so,sha256=yDq47YobRro7Tvd4IaPNyQUf1YaA8iLyfcwnUdh0Coo,484232
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=
+fbgemm_gpu/fbgemm.so,sha256=E4-lI4QpwkjkPmH3u1IKBUjBEDrdbL6YgeFnhIt5YKo,5811328
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
 fbgemm_gpu/quantize_comm.py,sha256=NqjKcQkieCrWH2HvxF8oTfzlgMA6sK9rHEUrSuCn5w4,11492
 fbgemm_gpu/quantize_utils.py,sha256=q8Aokk6nlHbXF6HcDBbhBCAGSZV4klM8uPF-MUFFtAw,8324
 fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
-fbgemm_gpu/sparse_ops.py,sha256=
+fbgemm_gpu/sparse_ops.py,sha256=VYm_3f-Z-59b3gPS2aykbNI-d_HXAIvlPjtU-EL9tlY,48448
 fbgemm_gpu/split_embedding_configs.py,sha256=fv29efZGD_cvh5KwdvTFD6GZtqJLYjWXW_0vMeyT_6k,15483
 fbgemm_gpu/split_embedding_inference_converter.py,sha256=AghGW22MgMsdHzdwdPMPYDjgas5AE_estckY8rMgXVU,7056
 fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
@@ -32,18 +32,18 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/version.py,sha256=
+fbgemm_gpu/docs/version.py,sha256=l0fTZZUWsGJgrEdtJbnCWLlUVNlJ1cmhFuAR4Maj8Sg,316
 fbgemm_gpu/experimental/example/__init__.py,sha256=V_XrGMq2oNVMpzwe1srlaTaHeIcZJw5oAGbo3seM_Ks,870
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=K70etfWeSleFOhfxXvSmpZMYBn_xmpvSxgdcGenvaKo,232488
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=AqHefiOaN_SjP5ew7RYGuKFuSlhedOJL_6f97TtLv7c,566
-fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=
+fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=2RjIDSzUXtoFoC2ryp-C-j5H83mbSjPwvsvTrThfrqE,215658
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=qwfuF5E5K4oDiH7RJkpC7zth3kAsG7wv_glCl2A_G2A,1860
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=7wbyCShChe1DVPKlqLliGPlpqo8U5AScgXWLllN9ZWY,77952696
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=oExepXpjMOwM43gARZARY0UtR-EX2zqRnSrOaQPy448,1044
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/METADATA,sha256=EpIY3ocq310OVN4Ma3kvReWnhF0OBb0syWTn5dY2S7M,2655
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.7.dist-info/RECORD,,
{fbgemm_gpu_genai_nightly-2025.10.5.dist-info → fbgemm_gpu_genai_nightly-2025.10.7.dist-info}/WHEEL
RENAMED
File without changes

{fbgemm_gpu_genai_nightly-2025.10.5.dist-info → fbgemm_gpu_genai_nightly-2025.10.7.dist-info}/top_level.txt
RENAMED
File without changes