fbgemm-gpu-genai-nightly 2025.10.2-cp313-cp313-manylinux_2_28_x86_64.whl → 2025.10.5-cp313-cp313-manylinux_2_28_x86_64.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

fbgemm_gpu/asmjit.so CHANGED
Binary file
fbgemm_gpu/config/feature_list.py CHANGED
@@ -11,7 +11,7 @@ from enum import auto, Enum
 import torch
 
 try:
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp_torch_op")
 except Exception:
     import fbgemm_gpu  # noqa F401
 
fbgemm_gpu/docs/version.py CHANGED
@@ -6,6 +6,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-__version__: str = "2025.10.2"
+__version__: str = "2025.10.5"
 __target__: str = "genai"
 __variant__: str = "cuda"
fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py CHANGED
@@ -5540,3 +5540,53 @@ def calculate_group_max(
         USE_INT64=use_int64,
     )
     return out, tensor_idx
+
+
+def get_nvfp4_global_scales_naive(
+    xs: list[torch.Tensor], ws: list[torch.Tensor]
+) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]:
+    """
+    Get global scales for each tensor in xs and ws.
+    This is done "naively" (not efficiently with a kernel). This function is used in unit tests or debugging.
+    """
+    global_scales = []
+    x_global_scales = []
+    w_global_scales = []
+
+    for x, w in zip(xs, ws):
+        # pyre-ignore
+        x_global_scale: torch.Tensor = (448.0 * 6.0) / torch.amax(
+            torch.abs(x.flatten()), dim=-1
+        ).to(torch.float32)
+        # pyre-ignore
+        w_global_scale: torch.Tensor = (448.0 * 6.0) / torch.amax(
+            torch.abs(w.flatten()), dim=-1
+        ).to(torch.float32)
+        # pyre-ignore
+        global_scale: torch.Tensor = 1 / (x_global_scale * w_global_scale)
+
+        global_scales.append(global_scale)
+        x_global_scales.append(x_global_scale)
+        w_global_scales.append(w_global_scale)
+
+    return global_scales, x_global_scales, w_global_scales
+
+
+def quantize_nvfp4_naive(
+    xs: list[torch.Tensor], global_scales: list[torch.Tensor]
+) -> tuple[
+    list[torch.Tensor],
+    list[torch.Tensor],
+]:
+    """
+    Quantize A to NVFP4 format.
+    This is done "naively" using a kernel for each group. This function is largely used in unit tests or debugging.
+    """
+    xqs, x_scales = zip(
+        *(
+            triton_scale_nvfp4_quant(x, global_scale)
+            for x, global_scale in zip(xs, global_scales)
+        )
+    )
+
+    return xqs, x_scales
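
The two helpers above compose: get_nvfp4_global_scales_naive derives one FP32 global scale per group from the (448.0 * 6.0) / amax rule (448 is the e4m3 scale maximum, 6 the e2m1 value maximum), and quantize_nvfp4_naive then calls triton_scale_nvfp4_quant once per group. A minimal sketch of using them together; the group count, shapes, and CUDA device are illustrative assumptions, not part of the diff:

import torch

from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
    get_nvfp4_global_scales_naive,
    quantize_nvfp4_naive,
)

# Hypothetical grouped inputs: G=4 groups sharing K=256, bf16 on CUDA.
xs = [torch.randn(64, 256, dtype=torch.bfloat16, device="cuda") for _ in range(4)]
ws = [torch.randn(128, 256, dtype=torch.bfloat16, device="cuda") for _ in range(4)]

# Per-group scales; global_scales holds 1 / (x_scale * w_scale), the factor
# a grouped GEMM would apply to rescale its output.
global_scales, x_global_scales, w_global_scales = get_nvfp4_global_scales_naive(xs, ws)

# One quantization kernel launch per group, hence "naive".
xqs, x_scales = quantize_nvfp4_naive(xs, x_global_scales)
wqs, w_scales = quantize_nvfp4_naive(ws, w_global_scales)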
fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py CHANGED
@@ -16,9 +16,11 @@ import triton # @manual=//triton:triton
 from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
     _to_blocked,
     calculate_group_max,
+    get_nvfp4_global_scales_naive,
     mega_fp4_pack,
     mega_fp4_quantize_kernel,
     mega_fp4_unpack,
+    quantize_nvfp4_naive,
     triton_quantize_mx4_unpack,
     triton_scale_nvfp4_quant,
     triton_scale_nvfp4_quant_rms,
@@ -96,6 +98,10 @@ except ImportError:
     quantize_op_registry = []
 
 
+def round_up(x: int, y: int) -> int:
+    return ((x + y - 1) // y) * y
+
+
 class QuantizeOpBase(metaclass=abc.ABCMeta):
     """Helper abstract class to define expected methods of quantize ops."""
 
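round_up, now hoisted to module scope, rounds x up to the next multiple of y with pure integer arithmetic; the NVFP4GroupedGemm2D2D op added below uses it to pad scale tensors to 128-row blocks. A quick check of the arithmetic:

assert round_up(300, 128) == 384  # (300 + 127) // 128 == 3 blocks of 128
assert round_up(256, 128) == 256  # already aligned, unchanged
assert round_up(1, 16) == 16
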
@@ -2592,6 +2598,336 @@ class MXFP4StackedGroupedGemm(QuantizeOpBase):
         return True
 
 
+@register_quantize_op
+class MXFP4GroupedGemm2D3D(QuantizeOpBase):
+    """
+    MXFP4 grouped GEMM with blockwise scaling and Torch 2D2D API.
+    """
+
+    def preprocess(self, xs, ws):
+        m_sizes = [x.shape[0] for x in xs]
+        m_offsets = torch.cumsum(torch.tensor(m_sizes), dim=0).to(
+            dtype=torch.int32, device=xs[0].device
+        )
+
+        wqs = []
+        w_scales = []
+        for w in ws:
+            wq, w_scale = triton_quantize_mx4_unpack(w)
+            wqs.append(wq)
+            w_scales.append(w_scale)
+
+        wq = torch.stack(wqs, dim=0)
+        w_scale = torch.stack(w_scales, dim=0)
+
+        return xs, wq, w_scale, m_offsets
+
+    def quantize(self, xs, wq, w_scale, m_offsets):
+        xqs = []
+        x_scales = []
+        for x in xs:
+            xq, x_scale = triton_quantize_mx4_unpack(x)
+            xqs.append(xq)
+            x_scales.append(x_scale)
+
+        xq = torch.cat(xqs, dim=0)
+        x_scale = torch.stack(x_scales, dim=0)
+
+        xq = xq.view(torch.float4_e2m1fn_x2)
+        wq = wq.view(torch.float4_e2m1fn_x2)
+        x_scale = x_scale.view(torch.float8_e8m0fnu)
+        w_scale = w_scale.view(torch.float8_e8m0fnu)
+
+        return xq, wq, x_scale, w_scale, m_offsets
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        m_offsets,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            m_offsets,
+        )
+
+    def quantize_and_compute(self, xs, wq, w_scale, m_offsets, output):
+        args = self.quantize(xs, wq, w_scale, m_offsets, output)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_mx_f4f4bf16_grouped_mm_2d_3d"
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+
+@register_quantize_op
+class MXFP4GroupedGemm2D2D(QuantizeOpBase):
+    """
+    MXFP4 grouped GEMM with blockwise scaling and Torch 2D2D API.
+    """
+
+    def preprocess(self, xs, ws):
+        k_sizes = [x.shape[1] for x in xs]
+        k_offsets = torch.cumsum(torch.tensor(k_sizes), dim=0).to(
+            dtype=torch.int32, device=xs[0].device
+        )
+
+        wqs = []
+        w_scales = []
+        for w in ws:
+            wq, w_scale = triton_quantize_mx4_unpack(w)
+            wqs.append(wq)
+            w_scales.append(w_scale)
+
+        wq = torch.cat(wqs, dim=1)
+        w_scale = torch.stack(w_scales, dim=0)
+
+        return xs, wq, w_scale, k_offsets
+
+    def quantize(self, xs, wq, w_scale, k_offsets):
+        xqs = []
+        x_scales = []
+        for x in xs:
+            xq, x_scale = triton_quantize_mx4_unpack(x)
+            xqs.append(xq)
+            x_scales.append(x_scale)
+
+        xq = torch.cat(xqs, dim=1)
+        x_scale = torch.stack(x_scales, dim=0)
+
+        xq = xq.view(torch.float4_e2m1fn_x2)
+        wq = wq.view(torch.float4_e2m1fn_x2)
+        x_scale = x_scale.view(torch.float8_e8m0fnu)
+        w_scale = w_scale.view(torch.float8_e8m0fnu)
+
+        return xq, wq, x_scale, w_scale, k_offsets
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        k_offsets,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            k_offsets,
+        )
+
+    def quantize_and_compute(self, xs, wq, w_scale, k_offsets, output):
+        args = self.quantize(xs, wq, w_scale, k_offsets, output)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_mx_f4f4bf16_grouped_mm_2d_2d"
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+
+@register_quantize_op
+class NVFP4GroupedGemm2D3D(QuantizeOpBase):
+    """
+    NVFP4 grouped GEMM with blockwise scaling and Torch 2D3D API.
+    """
+
+    def preprocess(self, x, w):
+        m_values = [i.shape[0] for i in x]
+        m_sizes = torch.tensor(m_values).to(dtype=torch.int64, device=x[0].device)
+        x = torch.concat(x, dim=0).contiguous()
+
+        def get_global_scale(x, w, m_sizes):
+            G = len(w)
+            w_global_scale = []
+            global_scale = []
+
+            x_global_scale, tensor_idx = calculate_group_max(x, m_sizes=m_sizes)
+
+            for i in range(G):
+                w_global_scale_ = (448.0 * 6.0) / torch.amax(
+                    torch.abs(w[i].flatten()), dim=-1
+                ).to(torch.float32)
+
+                global_scale_ = 1 / (x_global_scale[i] * w_global_scale_)
+
+                w_global_scale.append(w_global_scale_)
+                global_scale.append(global_scale_)
+
+            return x_global_scale, w_global_scale, global_scale, tensor_idx
+
+        # Compute global scale for each group
+        G = m_sizes.numel()
+        x_global_scale, w_global_scale, global_scale, tensor_idx = get_global_scale(
+            x, w, m_sizes
+        )
+        global_scale = torch.stack(global_scale, dim=0).contiguous()
+
+        wq, w_scale = zip(
+            *[triton_scale_nvfp4_quant(w[i], w_global_scale[i]) for i in range(G)]
+        )
+        wq = torch.stack(wq, dim=0).contiguous()
+        w_scale = torch.stack(w_scale, dim=0).contiguous()
+
+        return x, wq, w_scale, x_global_scale, global_scale, m_sizes, tensor_idx
+
+    def quantize(
+        self, x, wq, w_scale, x_global_scale, global_scale, m_sizes, tensor_idx
+    ):
+        xq, x_scale, _ = mega_fp4_quantize_kernel(
+            m_sizes, x, x_global_scale, optional_tensor_idx=tensor_idx
+        )
+
+        x_scale = x_scale.reshape(-1, x.shape[1] // 16)
+        offsets = torch.cumsum(m_sizes, dim=0).to(torch.int32)
+
+        xq = xq.view(torch.float4_e2m1fn_x2)
+        wq = wq.view(torch.float4_e2m1fn_x2)
+        x_scale = x_scale.view(torch.float8_e4m3fn)
+        w_scale = w_scale.view(torch.float8_e4m3fn)
+
+        return (
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            offsets,
+            None,
+            global_scale,
+        )
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        offsets,
+        output,
+        global_scale,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq,
+            x_scale,
+            w_scale,
+            offsets,
+            output,
+            global_scale,
+        )
+
+    def quantize_and_compute(self, xq, wq, x_scale, w_scale, global_scale, k_offsets):
+        args = self.quantize(xq, wq, x_scale, w_scale, global_scale, k_offsets)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_nv_f4f4bf16_grouped_mm_2d_3d"
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
+@register_quantize_op
+class NVFP4GroupedGemm2D2D(QuantizeOpBase):
+    """
+    NVFP4 grouped GEMM with blockwise scaling and Torch 2D2D API.
+    """
+
+    def preprocess(self, xs, ws):
+        k_sizes = [x.shape[1] for x in xs]
+        k_offsets = torch.cumsum(torch.tensor(k_sizes), dim=0).to(
+            dtype=torch.int32, device=xs[0].device
+        )
+
+        global_scales, x_global_scales, w_global_scales = get_nvfp4_global_scales_naive(
+            xs, ws
+        )
+        wqs, w_scales = quantize_nvfp4_naive(ws, w_global_scales)
+        wq = torch.cat(wqs, dim=1).view(torch.float4_e2m1fn_x2)
+        w_scale = (
+            torch.stack(w_scales, dim=0)
+            .reshape(round_up(wq.size(0), 128), -1)
+            .view(torch.float8_e4m3fn)
+        )
+        global_scale = torch.stack(global_scales, dim=0)
+
+        return xs, wq, w_scale, global_scale, x_global_scales, k_offsets
+
+    def quantize(self, xs, wq, w_scale, global_scale, x_global_scales, k_offsets):
+        xqs, x_scales = quantize_nvfp4_naive(xs, x_global_scales)
+        xq = torch.cat(xqs, dim=1).view(torch.float4_e2m1fn_x2)
+        x_scale = (
+            torch.stack(x_scales, dim=0)
+            .reshape(round_up(xq.size(0), 128), -1)
+            .view(torch.float8_e4m3fn)
+        )
+
+        return xq, wq, x_scale, w_scale, k_offsets, global_scale
+
+    def compute(
+        self,
+        xq,
+        wq,
+        x_scale,
+        w_scale,
+        k_offsets,
+        global_scale,
+    ):
+        return torch.ops.fbgemm.f4f4bf16_grouped_mm(
+            xq,
+            wq.transpose(-2, -1),
+            x_scale,
+            w_scale,
+            k_offsets,
+            None,
+            global_scale,
+        )
+
+    def quantize_and_compute(self, xq, wq, x_scale, w_scale, global_scale, k_offsets):
+        args = self.quantize(xq, wq, x_scale, w_scale, global_scale, k_offsets)
+        return self.compute(**args)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_nv_f4f4bf16_grouped_mm_2d_2d"
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
 @register_quantize_op
 class NVFP4StackedGroupedGemm(QuantizeOpBase):
     """
@@ -3064,9 +3400,6 @@ class MXFP8GroupedGemm2d2d(QuantizeOpBase):
         x_blocked_scale_list = []
         w_blocked_scale_list = []
 
-        def round_up(x: int, y: int) -> int:
-            return ((x + y - 1) // y) * y
-
         for group_idx in range(G):
             # to_mxfp8 per group
             prev_group_end_offset = (
fbgemm_gpu/fbgemm.so CHANGED
Binary file
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_genai_nightly
-Version: 2025.10.2
+Version: 2025.10.5
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
@@ -1,8 +1,8 @@
 fbgemm_gpu/__init__.py,sha256=FdQCmpvETH80tlIPP6W8MrOmzLaX9eoGY-fuHtVPbj0,5747
-fbgemm_gpu/asmjit.so,sha256=UxnhHlu9LgmoRXa8fZwSX56b5QKffBxfAOs0AZLxRfk,501728
+fbgemm_gpu/asmjit.so,sha256=PFeEgzpuz45ai1N1fj0C87yHOw9OZFoW6N9VZHyxxHI,484232
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=P-80NThzhyQWN9WMb2kYfl04sAgPCehbdArfnktJaqw,5634424
+fbgemm_gpu/fbgemm.so,sha256=E4-lI4QpwkjkPmH3u1IKBUjBEDrdbL6YgeFnhIt5YKo,5811328
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
@@ -23,7 +23,7 @@ fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
 fbgemm_gpu/uvm.py,sha256=guNK8ZzR80jmv-CyRgEhxhVYhjz3R9d6tB8Hu1uWDUo,1047
 fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
-fbgemm_gpu/config/feature_list.py,sha256=P9lL-6e6sy0qrkMPMljhT1ibtU45f8hg5otFCNz96EA,2350
+fbgemm_gpu/config/feature_list.py,sha256=iDOGr9nwTqUhWsqOefRIqIo1jwLSeSII4jGnLeU01kg,2359
 fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
 fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
 fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
@@ -32,18 +32,18 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/version.py,sha256=U9HFTyqt_827sXJZ7N9Dik7e18vj0x7B38Go9HoScG4,316
+fbgemm_gpu/docs/version.py,sha256=axMqnt_uxHLVuIT4M2QVOCEQgEYGPpnbD2G5jg9tAXA,316
 fbgemm_gpu/experimental/example/__init__.py,sha256=V_XrGMq2oNVMpzwe1srlaTaHeIcZJw5oAGbo3seM_Ks,870
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=FhRsyrR4lvC-pV4G8-TV8YNOjZfH-rgSq4kUb4-rlBk,243904
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=kqkxwgho3f7Y28tPavr-Q6Rn6BoGJiBWSXqXmNE5oOw,232488
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=AqHefiOaN_SjP5ew7RYGuKFuSlhedOJL_6f97TtLv7c,566
-fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=I2xf2DlU27KA9s0256tkGLhdOoImUv7i7oHc8bz5Y2M,211841
+fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=qJmQOBa9iW-HhRYm8lzE36Lz7vpBevCS6pWQyy33pag,213404
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=qwfuF5E5K4oDiH7RJkpC7zth3kAsG7wv_glCl2A_G2A,1860
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=25yn_zfQ4jsSvwBvM2ctQwu-ppkEpSuVILoFvXJdr-I,78714952
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=JM9t3lasZhpw08f_Z0ZQj8pe1BCqagOVUNhJST0qTps,77952696
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=oExepXpjMOwM43gARZARY0UtR-EX2zqRnSrOaQPy448,1044
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
@@ -52,7 +52,7 @@ fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=GvCUF6o7wCR3XSWingWKxn_Y
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
 fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py,sha256=BWl6t-4acbuRSEX2aVNDlFrSWZkqMWK2sI3VONaMd3Q,24047
-fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=9LHqmEafRadDP4m0LHX1zd0PiAMIg83s_Q8ebgZnoMg,95796
+fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=H6AchejyZs76_snM_ae5vV0cPr_Q0h35OQ8qED0r1N4,104915
 fbgemm_gpu/experimental/gen_ai/moe/README.md,sha256=z9ybHmv4KFJ1drj5OByuFaOY0tRQwwiIW3Q22TB_2-k,904
 fbgemm_gpu/experimental/gen_ai/moe/__init__.py,sha256=XzFeGAmhJ-QvsUyzmlRZWExvaVJ1CbBk3ENo-LK3KHk,2052
 fbgemm_gpu/experimental/gen_ai/moe/activation.py,sha256=NiXhWyCNagI3P9N3N89iSX7xKuShdkq9DxEUAzoV6y0,7892
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.2.dist-info/METADATA,sha256=KiCiRdg53J2HiyUZMdm_uIZHb-E8u0QQj9uRSc9oRIM,2655
-fbgemm_gpu_genai_nightly-2025.10.2.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
-fbgemm_gpu_genai_nightly-2025.10.2.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_genai_nightly-2025.10.2.dist-info/RECORD,,
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/METADATA,sha256=rTogIn95pgowlTBehwDMPRA5MmXP09AbRW_k4y12u84,2655
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.5.dist-info/RECORD,,