fbgemm-gpu-genai-nightly 2025.9.29__cp311-cp311-manylinux_2_28_x86_64.whl → 2025.10.2__cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fbgemm-gpu-genai-nightly might be problematic. Click here for more details.

fbgemm_gpu/asmjit.so CHANGED
Binary file
@@ -6,6 +6,6 @@
6
6
  # This source code is licensed under the BSD-style license found in the
7
7
  # LICENSE file in the root directory of this source tree.
8
8
 
9
- __version__: str = "2025.9.29"
9
+ __version__: str = "2025.10.2"
10
10
  __target__: str = "genai"
11
11
  __variant__: str = "cuda"
@@ -3814,7 +3814,6 @@ def get_full_non_persistent_tuning_space():
3814
3814
  num_stages=num_stages,
3815
3815
  )
3816
3816
  )
3817
- logger.info(f"all configs #: {len(configs)}")
3818
3817
  return configs
3819
3818
 
3820
3819
 
@@ -3839,6 +3838,8 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
3839
3838
  (128, 64, 64, 4, 1, 0, 16, 2, 4, 2),
3840
3839
  (128, 64, 64, 1, 1, 0, 16, 2, 4, 2),
3841
3840
  (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
3841
+ (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
3842
+ (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
3842
3843
  ]
3843
3844
 
3844
3845
 
@@ -13,13 +13,13 @@ from torch.library import register_fake
13
13
  torch.library.define(
14
14
  "blackwell_fmha::fmha_fwd",
15
15
  "(Tensor q, Tensor k, Tensor v, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, Tensor? seqlen_kv) -> (Tensor, Tensor)",
16
- tags=[torch.Tag.pt2_compliant_tag],
16
+ tags=torch.Tag.pt2_compliant_tag,
17
17
  )
18
18
 
19
19
  torch.library.define(
20
20
  "blackwell_fmha::fmha_bwd",
21
21
  "(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, bool? causal) -> (Tensor, Tensor, Tensor)",
22
- tags=[torch.Tag.pt2_compliant_tag],
22
+ tags=torch.Tag.pt2_compliant_tag,
23
23
  )
24
24
 
25
25
 
fbgemm_gpu/fbgemm.so CHANGED
Binary file
@@ -3971,8 +3971,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3971
3971
  self.step, stats_reporter.report_interval # pyre-ignore
3972
3972
  )
3973
3973
 
3974
- if len(dram_kv_perf_stats) != 23:
3975
- logging.error("dram cache perf stats should have 23 elements")
3974
+ if len(dram_kv_perf_stats) != 24:
3975
+ logging.error("dram cache perf stats should have 24 elements")
3976
3976
  return
3977
3977
 
3978
3978
  dram_read_duration = dram_kv_perf_stats[0]
@@ -4001,6 +4001,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
4001
4001
  dram_kv_allocated_bytes = dram_kv_perf_stats[20]
4002
4002
  dram_kv_actual_used_chunk_bytes = dram_kv_perf_stats[21]
4003
4003
  dram_kv_num_rows = dram_kv_perf_stats[22]
4004
+ dram_kv_read_counts = dram_kv_perf_stats[23]
4004
4005
 
4005
4006
  stats_reporter.report_duration(
4006
4007
  iteration_step=self.step,
@@ -4142,6 +4143,13 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
4142
4143
  enable_tb_metrics=True,
4143
4144
  )
4144
4145
 
4146
+ stats_reporter.report_data_amount(
4147
+ iteration_step=self.step,
4148
+ event_name="dram_kv.perf.get.dram_kv_read_counts",
4149
+ data_bytes=dram_kv_read_counts,
4150
+ enable_tb_metrics=True,
4151
+ )
4152
+
4145
4153
  stats_reporter.report_data_amount(
4146
4154
  iteration_step=self.step,
4147
4155
  event_name=self.dram_kv_allocated_bytes_stats_name,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fbgemm_gpu_genai_nightly
3
- Version: 2025.9.29
3
+ Version: 2025.10.2
4
4
  Home-page: https://github.com/pytorch/fbgemm
5
5
  Author: FBGEMM Team
6
6
  Author-email: packages@pytorch.org
@@ -1,8 +1,8 @@
1
1
  fbgemm_gpu/__init__.py,sha256=FdQCmpvETH80tlIPP6W8MrOmzLaX9eoGY-fuHtVPbj0,5747
2
- fbgemm_gpu/asmjit.so,sha256=RxTYI8zY4PpIBRpSKT_-U7bRIVeTRohdtRFUmLNU1tQ,501728
2
+ fbgemm_gpu/asmjit.so,sha256=tf4jzD7HrM0qcrQl_7Q_y3FJ62duD7b3tKkh5TAN7k8,484232
3
3
  fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
4
4
  fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
5
- fbgemm_gpu/fbgemm.so,sha256=P-80NThzhyQWN9WMb2kYfl04sAgPCehbdArfnktJaqw,5634424
5
+ fbgemm_gpu/fbgemm.so,sha256=HdsyKYHtVNz5ZNpTcSI4SDuYAGks1hYeVz45ZPjuJts,5790800
6
6
  fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
7
7
  fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
8
8
  fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
@@ -32,21 +32,21 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
32
32
  fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
33
33
  fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
34
34
  fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
35
- fbgemm_gpu/docs/version.py,sha256=F5j2fIxoKrCau1X_4joWddM6S3n3XwdUZvZg5Gc_F48,316
35
+ fbgemm_gpu/docs/version.py,sha256=U9HFTyqt_827sXJZ7N9Dik7e18vj0x7B38Go9HoScG4,316
36
36
  fbgemm_gpu/experimental/example/__init__.py,sha256=V_XrGMq2oNVMpzwe1srlaTaHeIcZJw5oAGbo3seM_Ks,870
37
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=-VrIV7q3ZUbZSwgISxDhp5ch8YiOkC6ftvwwQ9UNcb8,243904
37
+ fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=1dZS9lk6C74qym6nRYRikoQlfcRelfkL4i8v2stWylY,232488
38
38
  fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
39
39
  fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=AqHefiOaN_SjP5ew7RYGuKFuSlhedOJL_6f97TtLv7c,566
40
40
  fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=I2xf2DlU27KA9s0256tkGLhdOoImUv7i7oHc8bz5Y2M,211841
41
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=iB13OssIfH1ShYn-L-Kwd0hFrvkInFnK9YXw248EdMA,152719
41
+ fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
42
42
  fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
43
43
  fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
44
44
  fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
45
45
  fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=qwfuF5E5K4oDiH7RJkpC7zth3kAsG7wv_glCl2A_G2A,1860
46
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=2Xp9jIeo84oH-Y0sjSU5KC5eAjU_52lZAap03P3E33c,78714952
46
+ fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=crMx6rSmrjg53kw2gLRXCksM4G-vnqDzxEmHFuRuPYk,78050824
47
47
  fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
48
48
  fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=oExepXpjMOwM43gARZARY0UtR-EX2zqRnSrOaQPy448,1044
49
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=D90VgPOxnx1NpnDnajIv_L7AHq4rrmFIch0iV2elAVU,7825
49
+ fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
50
50
  fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=sWk5888_e7Qhfik0X1uZ1VsEUmY5uidxHcqtpoH42Is,9406
51
51
  fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=GvCUF6o7wCR3XSWingWKxn_Y3_F2GhZtOIRAB3pfqK0,551
52
52
  fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
@@ -99,7 +99,7 @@ fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29Xt
99
99
  fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
100
100
  fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
101
101
  fbgemm_gpu/tbe/ssd/inference.py,sha256=B_uX66ajGA9YKGlFa5TmGWs7b-b1RFigzwxmENZ9Oio,22816
102
- fbgemm_gpu/tbe/ssd/training.py,sha256=dnUHnl9FzPneXrWUCA_HKvLI5QHQnL0vd9neMrjZJ-Q,194623
102
+ fbgemm_gpu/tbe/ssd/training.py,sha256=iepmavrK6cSXrqWg3TqVXL5gVmSftfHJycesp7I0Nw4,194911
103
103
  fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
104
104
  fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
105
105
  fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
121
121
  fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
122
122
  list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
123
123
  list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
124
- fbgemm_gpu_genai_nightly-2025.9.29.dist-info/METADATA,sha256=EJ6RwlJa0HCpvnSKgLfI9lNQm95VHwHh9RosQp900Fg,2655
125
- fbgemm_gpu_genai_nightly-2025.9.29.dist-info/WHEEL,sha256=V2Q6mQKbouIadCxoRjt9FQ9oKfi45-uZUcoc77zzs0M,108
126
- fbgemm_gpu_genai_nightly-2025.9.29.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
127
- fbgemm_gpu_genai_nightly-2025.9.29.dist-info/RECORD,,
124
+ fbgemm_gpu_genai_nightly-2025.10.2.dist-info/METADATA,sha256=KiCiRdg53J2HiyUZMdm_uIZHb-E8u0QQj9uRSc9oRIM,2655
125
+ fbgemm_gpu_genai_nightly-2025.10.2.dist-info/WHEEL,sha256=V2Q6mQKbouIadCxoRjt9FQ9oKfi45-uZUcoc77zzs0M,108
126
+ fbgemm_gpu_genai_nightly-2025.10.2.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
127
+ fbgemm_gpu_genai_nightly-2025.10.2.dist-info/RECORD,,