fbgemm-gpu-genai-nightly 2025.10.2-cp310-cp310-manylinux_2_28_x86_64.whl → 2025.10.5-cp310-cp310-manylinux_2_28_x86_64.whl
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/config/feature_list.py +1 -1
- fbgemm_gpu/docs/version.py +1 -1
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +50 -0
- fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +336 -3
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/fbgemm.so +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.2.dist-info → fbgemm_gpu_genai_nightly-2025.10.5.dist-info}/METADATA +1 -1
- {fbgemm_gpu_genai_nightly-2025.10.2.dist-info → fbgemm_gpu_genai_nightly-2025.10.5.dist-info}/RECORD +12 -12
- {fbgemm_gpu_genai_nightly-2025.10.2.dist-info → fbgemm_gpu_genai_nightly-2025.10.5.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.2.dist-info → fbgemm_gpu_genai_nightly-2025.10.5.dist-info}/top_level.txt +0 -0
fbgemm_gpu/asmjit.so
CHANGED
Binary file

fbgemm_gpu/config/feature_list.py
CHANGED
@@ -11,7 +11,7 @@ from enum import auto, Enum
 import torch

 try:
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp_torch_op")
 except Exception:
     import fbgemm_gpu  # noqa F401

fbgemm_gpu/docs/version.py
CHANGED

fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so
CHANGED
Binary file

fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py
CHANGED
@@ -5540,3 +5540,53 @@ def calculate_group_max(
         USE_INT64=use_int64,
     )
     return out, tensor_idx
+
+
+def get_nvfp4_global_scales_naive(
+    xs: list[torch.Tensor], ws: list[torch.Tensor]
+) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]:
+    """
+    Get global scales for each tensor in xs and ws.
+    This is done "naively" (not efficiently with a kernel). This function is used in unit tests or debugging.
+    """
+    global_scales = []
+    x_global_scales = []
+    w_global_scales = []
+
+    for x, w in zip(xs, ws):
+        # pyre-ignore
+        x_global_scale: torch.Tensor = (448.0 * 6.0) / torch.amax(
+            torch.abs(x.flatten()), dim=-1
+        ).to(torch.float32)
+        # pyre-ignore
+        w_global_scale: torch.Tensor = (448.0 * 6.0) / torch.amax(
+            torch.abs(w.flatten()), dim=-1
+        ).to(torch.float32)
+        # pyre-ignore
+        global_scale: torch.Tensor = 1 / (x_global_scale * w_global_scale)
+
+        global_scales.append(global_scale)
+        x_global_scales.append(x_global_scale)
+        w_global_scales.append(w_global_scale)
+
+    return global_scales, x_global_scales, w_global_scales
+
+
+def quantize_nvfp4_naive(
+    xs: list[torch.Tensor], global_scales: list[torch.Tensor]
+) -> tuple[
+    list[torch.Tensor],
+    list[torch.Tensor],
+]:
+    """
+    Quantize A to NVFP4 format.
+    This is done "naively" using a kernel for each group. This function is largely used in unit tests or debugging.
+    """
+    xqs, x_scales = zip(
+        *(
+            triton_scale_nvfp4_quant(x, global_scale)
+            for x, global_scale in zip(xs, global_scales)
+        )
+    )
+
+    return xqs, x_scales
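For context on the two helpers added above, here is a minimal usage sketch. It is not part of the package; it assumes a CUDA build of fbgemm-gpu-genai with the Triton NVFP4 kernels available, and the shapes and dtypes below are illustrative only.

import torch

from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
    get_nvfp4_global_scales_naive,
    quantize_nvfp4_naive,
)

# Illustrative per-group activations and weights (arbitrary shapes).
xs = [torch.randn(128, 256, device="cuda", dtype=torch.bfloat16) for _ in range(4)]
ws = [torch.randn(512, 256, device="cuda", dtype=torch.bfloat16) for _ in range(4)]

# Global scales are (448.0 * 6.0) / amax(|t|): 448 is the largest FP8-e4m3 block
# scale and 6 is the largest representable FP4-e2m1 magnitude.
global_scales, x_global_scales, w_global_scales = get_nvfp4_global_scales_naive(xs, ws)

# One triton_scale_nvfp4_quant call per group: packed FP4 values plus
# FP8-e4m3 block scales for each tensor.
xqs, x_scales = quantize_nvfp4_naive(xs, x_global_scales)
wqs, w_scales = quantize_nvfp4_naive(ws, w_global_scales)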
fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
CHANGED
@@ -16,9 +16,11 @@ import triton  # @manual=//triton:triton
 from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
     _to_blocked,
     calculate_group_max,
+    get_nvfp4_global_scales_naive,
     mega_fp4_pack,
     mega_fp4_quantize_kernel,
     mega_fp4_unpack,
+    quantize_nvfp4_naive,
     triton_quantize_mx4_unpack,
     triton_scale_nvfp4_quant,
     triton_scale_nvfp4_quant_rms,
@@ -96,6 +98,10 @@ except ImportError:
     quantize_op_registry = []


+def round_up(x: int, y: int) -> int:
+    return ((x + y - 1) // y) * y
+
+
 class QuantizeOpBase(metaclass=abc.ABCMeta):
     """Helper abstract class to define expected methods of quantize ops."""

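The round_up helper added above is hoisted to module scope (it was previously a local function inside MXFP8GroupedGemm2d2d, removed in a later hunk) so the new NVFP4 2D2D op can reuse it to pad scale rows to a multiple of 128. A quick worked check, restated here as a sketch rather than package code:

def round_up(x: int, y: int) -> int:
    return ((x + y - 1) // y) * y

# Padding a scale dimension to the next multiple of 128:
assert round_up(130, 128) == 256
assert round_up(256, 128) == 256
assert round_up(1, 16) == 16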
@@ -2592,6 +2598,336 @@ class MXFP4StackedGroupedGemm(QuantizeOpBase):
         return True


+@register_quantize_op
+class MXFP4GroupedGemm2D3D(QuantizeOpBase):
+    """
+    MXFP4 grouped GEMM with blockwise scaling and Torch 2D2D API.
+    """
+
+    def preprocess(self, xs, ws):
+        m_sizes = [x.shape[0] for x in xs]
+        m_offsets = torch.cumsum(torch.tensor(m_sizes), dim=0).to(
+            dtype=torch.int32, device=xs[0].device
+        )
+
+        wqs = []
+        w_scales = []
+        for w in ws:
+            wq, w_scale = triton_quantize_mx4_unpack(w)
+            wqs.append(wq)
+            w_scales.append(w_scale)
+
+        wq = torch.stack(wqs, dim=0)
+        w_scale = torch.stack(w_scales, dim=0)
+
+        return xs, wq, w_scale, m_offsets
+
+    def quantize(self, xs, wq, w_scale, m_offsets):
+        xqs = []
+        x_scales = []
+        for x in xs:
+            xq, x_scale = triton_quantize_mx4_unpack(x)
+            xqs.append(xq)
+            x_scales.append(x_scale)
+
+        xq = torch.cat(xqs, dim=0)
+        x_scale = torch.stack(x_scales, dim=0)
+
+        xq = xq.view(torch.float4_e2m1fn_x2)
+        wq = wq.view(torch.float4_e2m1fn_x2)
+        x_scale = x_scale.view(torch.float8_e8m0fnu)
+        w_scale = w_scale.view(torch.float8_e8m0fnu)
+
+        return xq, wq, x_scale, w_scale, m_offsets
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        m_offsets,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            m_offsets,
+        )
+
+    def quantize_and_compute(self, xs, wq, w_scale, m_offsets, output):
+        args = self.quantize(xs, wq, w_scale, m_offsets, output)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_mx_f4f4bf16_grouped_mm_2d_3d"
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+
+@register_quantize_op
+class MXFP4GroupedGemm2D2D(QuantizeOpBase):
+    """
+    MXFP4 grouped GEMM with blockwise scaling and Torch 2D2D API.
+    """
+
+    def preprocess(self, xs, ws):
+        k_sizes = [x.shape[1] for x in xs]
+        k_offsets = torch.cumsum(torch.tensor(k_sizes), dim=0).to(
+            dtype=torch.int32, device=xs[0].device
+        )
+
+        wqs = []
+        w_scales = []
+        for w in ws:
+            wq, w_scale = triton_quantize_mx4_unpack(w)
+            wqs.append(wq)
+            w_scales.append(w_scale)
+
+        wq = torch.cat(wqs, dim=1)
+        w_scale = torch.stack(w_scales, dim=0)
+
+        return xs, wq, w_scale, k_offsets
+
+    def quantize(self, xs, wq, w_scale, k_offsets):
+        xqs = []
+        x_scales = []
+        for x in xs:
+            xq, x_scale = triton_quantize_mx4_unpack(x)
+            xqs.append(xq)
+            x_scales.append(x_scale)
+
+        xq = torch.cat(xqs, dim=1)
+        x_scale = torch.stack(x_scales, dim=0)
+
+        xq = xq.view(torch.float4_e2m1fn_x2)
+        wq = wq.view(torch.float4_e2m1fn_x2)
+        x_scale = x_scale.view(torch.float8_e8m0fnu)
+        w_scale = w_scale.view(torch.float8_e8m0fnu)
+
+        return xq, wq, x_scale, w_scale, k_offsets
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        k_offsets,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            k_offsets,
+        )
+
+    def quantize_and_compute(self, xs, wq, w_scale, k_offsets, output):
+        args = self.quantize(xs, wq, w_scale, k_offsets, output)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_mx_f4f4bf16_grouped_mm_2d_2d"
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+
+@register_quantize_op
+class NVFP4GroupedGemm2D3D(QuantizeOpBase):
+    """
+    NVFP4 grouped GEMM with blockwise scaling and Torch 2D3D API.
+    """
+
+    def preprocess(self, x, w):
+        m_values = [i.shape[0] for i in x]
+        m_sizes = torch.tensor(m_values).to(dtype=torch.int64, device=x[0].device)
+        x = torch.concat(x, dim=0).contiguous()
+
+        def get_global_scale(x, w, m_sizes):
+            G = len(w)
+            w_global_scale = []
+            global_scale = []
+
+            x_global_scale, tensor_idx = calculate_group_max(x, m_sizes=m_sizes)
+
+            for i in range(G):
+                w_global_scale_ = (448.0 * 6.0) / torch.amax(
+                    torch.abs(w[i].flatten()), dim=-1
+                ).to(torch.float32)
+
+                global_scale_ = 1 / (x_global_scale[i] * w_global_scale_)
+
+                w_global_scale.append(w_global_scale_)
+                global_scale.append(global_scale_)
+
+            return x_global_scale, w_global_scale, global_scale, tensor_idx
+
+        # Compute global scale for each group
+        G = m_sizes.numel()
+        x_global_scale, w_global_scale, global_scale, tensor_idx = get_global_scale(
+            x, w, m_sizes
+        )
+        global_scale = torch.stack(global_scale, dim=0).contiguous()
+
+        wq, w_scale = zip(
+            *[triton_scale_nvfp4_quant(w[i], w_global_scale[i]) for i in range(G)]
+        )
+        wq = torch.stack(wq, dim=0).contiguous()
+        w_scale = torch.stack(w_scale, dim=0).contiguous()
+
+        return x, wq, w_scale, x_global_scale, global_scale, m_sizes, tensor_idx
+
+    def quantize(
+        self, x, wq, w_scale, x_global_scale, global_scale, m_sizes, tensor_idx
+    ):
+        xq, x_scale, _ = mega_fp4_quantize_kernel(
+            m_sizes, x, x_global_scale, optional_tensor_idx=tensor_idx
+        )
+
+        x_scale = x_scale.reshape(-1, x.shape[1] // 16)
+        offsets = torch.cumsum(m_sizes, dim=0).to(torch.int32)
+
+        xq = xq.view(torch.float4_e2m1fn_x2)
+        wq = wq.view(torch.float4_e2m1fn_x2)
+        x_scale = x_scale.view(torch.float8_e4m3fn)
+        w_scale = w_scale.view(torch.float8_e4m3fn)
+
+        return (
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            offsets,
+            None,
+            global_scale,
+        )
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        offsets,
+        output,
+        global_scale,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq,
+            x_scale,
+            w_scale,
+            offsets,
+            output,
+            global_scale,
+        )
+
+    def quantize_and_compute(self, xq, wq, x_scale, w_scale, global_scale, k_offsets):
+        args = self.quantize(xq, wq, x_scale, w_scale, global_scale, k_offsets)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_nv_f4f4bf16_grouped_mm_2d_3d"
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
+@register_quantize_op
+class NVFP4GroupedGemm2D2D(QuantizeOpBase):
+    """
+    NVFP4 grouped GEMM with blockwise scaling and Torch 2D2D API.
+    """
+
+    def preprocess(self, xs, ws):
+        k_sizes = [x.shape[1] for x in xs]
+        k_offsets = torch.cumsum(torch.tensor(k_sizes), dim=0).to(
+            dtype=torch.int32, device=xs[0].device
+        )
+
+        global_scales, x_global_scales, w_global_scales = get_nvfp4_global_scales_naive(
+            xs, ws
+        )
+        wqs, w_scales = quantize_nvfp4_naive(ws, w_global_scales)
+        wq = torch.cat(wqs, dim=1).view(torch.float4_e2m1fn_x2)
+        w_scale = (
+            torch.stack(w_scales, dim=0)
+            .reshape(round_up(wq.size(0), 128), -1)
+            .view(torch.float8_e4m3fn)
+        )
+        global_scale = torch.stack(global_scales, dim=0)
+
+        return xs, wq, w_scale, global_scale, x_global_scales, k_offsets
+
+    def quantize(self, xs, wq, w_scale, global_scale, x_global_scales, k_offsets):
+        xqs, x_scales = quantize_nvfp4_naive(xs, x_global_scales)
+        xq = torch.cat(xqs, dim=1).view(torch.float4_e2m1fn_x2)
+        x_scale = (
+            torch.stack(x_scales, dim=0)
+            .reshape(round_up(xq.size(0), 128), -1)
+            .view(torch.float8_e4m3fn)
+        )
+
+        return xq, wq, x_scale, w_scale, k_offsets, global_scale
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        k_offsets,
+        global_scale,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            k_offsets,
+            None,
+            global_scale,
+        )
+
+    def quantize_and_compute(self, xq, wq, x_scale, w_scale, global_scale, k_offsets):
+        args = self.quantize(xq, wq, x_scale, w_scale, global_scale, k_offsets)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_nv_f4f4bf16_grouped_mm_2d_2d"
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
 @register_quantize_op
 class NVFP4StackedGroupedGemm(QuantizeOpBase):
     """
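The newly registered ops above plug into the quantize_bench harness via the usual preprocess, quantize, and compute methods. A hedged sketch of driving one of them directly, not taken from the package: it assumes a CUDA build exposing torch.ops.fbgemm.f4f4bf16_grouped_mm, hardware with FP4 support, and illustrative shapes.

import torch

from fbgemm_gpu.experimental.gen_ai.bench.quantize_ops import MXFP4GroupedGemm2D3D

op = MXFP4GroupedGemm2D3D()

# Two groups with different M and a shared K/N (illustrative sizes).
xs = [torch.randn(m, 4096, device="cuda", dtype=torch.bfloat16) for m in (64, 128)]
ws = [torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16) for _ in range(2)]

# preprocess() quantizes the weights once (MX4 blockwise); quantize() handles activations.
xs, wq, w_scale, m_offsets = op.preprocess(xs, ws)
xq, wq, x_scale, w_scale, m_offsets = op.quantize(xs, wq, w_scale, m_offsets)

# Grouped FP4 GEMM with BF16 output.
out = op.compute(xq, wq, x_scale, w_scale, m_offsets)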
@@ -3064,9 +3400,6 @@ class MXFP8GroupedGemm2d2d(QuantizeOpBase):
         x_blocked_scale_list = []
         w_blocked_scale_list = []

-        def round_up(x: int, y: int) -> int:
-            return ((x + y - 1) // y) * y
-
         for group_idx in range(G):
             # to_mxfp8 per group
             prev_group_end_offset = (

fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so
CHANGED
Binary file

fbgemm_gpu/fbgemm.so
CHANGED
Binary file
{fbgemm_gpu_genai_nightly-2025.10.2.dist-info → fbgemm_gpu_genai_nightly-2025.10.5.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
 fbgemm_gpu/__init__.py,sha256=FdQCmpvETH80tlIPP6W8MrOmzLaX9eoGY-fuHtVPbj0,5747
-fbgemm_gpu/asmjit.so,sha256=
+fbgemm_gpu/asmjit.so,sha256=5N9owjQJb6tzArT7Lzjl_gIEjn5eDTvOvjxwLEVTaV4,484232
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=
+fbgemm_gpu/fbgemm.so,sha256=E4-lI4QpwkjkPmH3u1IKBUjBEDrdbL6YgeFnhIt5YKo,5811328
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
@@ -23,7 +23,7 @@ fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
 fbgemm_gpu/uvm.py,sha256=guNK8ZzR80jmv-CyRgEhxhVYhjz3R9d6tB8Hu1uWDUo,1047
 fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
-fbgemm_gpu/config/feature_list.py,sha256=
+fbgemm_gpu/config/feature_list.py,sha256=iDOGr9nwTqUhWsqOefRIqIo1jwLSeSII4jGnLeU01kg,2359
 fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
 fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
 fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
@@ -32,18 +32,18 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/version.py,sha256=
+fbgemm_gpu/docs/version.py,sha256=axMqnt_uxHLVuIT4M2QVOCEQgEYGPpnbD2G5jg9tAXA,316
 fbgemm_gpu/experimental/example/__init__.py,sha256=V_XrGMq2oNVMpzwe1srlaTaHeIcZJw5oAGbo3seM_Ks,870
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=vUe4mdTJLEcR2yxamKqjTN7K7vCWkTFlGcdcfxCU56c,232488
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=AqHefiOaN_SjP5ew7RYGuKFuSlhedOJL_6f97TtLv7c,566
-fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=
+fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=qJmQOBa9iW-HhRYm8lzE36Lz7vpBevCS6pWQyy33pag,213404
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=qwfuF5E5K4oDiH7RJkpC7zth3kAsG7wv_glCl2A_G2A,1860
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=i68jA82SlC56vMpmgv1uI196AaQzS4QHCh1V4P67_H8,77952696
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=oExepXpjMOwM43gARZARY0UtR-EX2zqRnSrOaQPy448,1044
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
@@ -52,7 +52,7 @@ fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=GvCUF6o7wCR3XSWingWKxn_Y
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
 fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py,sha256=BWl6t-4acbuRSEX2aVNDlFrSWZkqMWK2sI3VONaMd3Q,24047
-fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=
+fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=H6AchejyZs76_snM_ae5vV0cPr_Q0h35OQ8qED0r1N4,104915
 fbgemm_gpu/experimental/gen_ai/moe/README.md,sha256=z9ybHmv4KFJ1drj5OByuFaOY0tRQwwiIW3Q22TB_2-k,904
 fbgemm_gpu/experimental/gen_ai/moe/__init__.py,sha256=XzFeGAmhJ-QvsUyzmlRZWExvaVJ1CbBk3ENo-LK3KHk,2052
 fbgemm_gpu/experimental/gen_ai/moe/activation.py,sha256=NiXhWyCNagI3P9N3N89iSX7xKuShdkq9DxEUAzoV6y0,7892
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/METADATA,sha256=rTogIn95pgowlTBehwDMPRA5MmXP09AbRW_k4y12u84,2655
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/WHEEL,sha256=k9CVMKlTmOLLXq_OyiiJFbPd6UKfogV4yIUezgPmplE,108
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/RECORD,,
{fbgemm_gpu_genai_nightly-2025.10.2.dist-info → fbgemm_gpu_genai_nightly-2025.10.5.dist-info}/WHEEL
RENAMED
File without changes

{fbgemm_gpu_genai_nightly-2025.10.2.dist-info → fbgemm_gpu_genai_nightly-2025.10.5.dist-info}/top_level.txt
RENAMED
File without changes