fbgemm-gpu-genai-nightly 2025.10.20-cp312-cp312-manylinux_2_28_x86_64.whl → 2025.10.25-cp312-cp312-manylinux_2_28_x86_64.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Potentially problematic release: this version of fbgemm-gpu-genai-nightly might be problematic.

fbgemm_gpu/__init__.py CHANGED
@@ -15,7 +15,6 @@ import torch
 # Based on the FBGEMM-PyTorch compatibility table at
 # https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
 _fbgemm_torch_compat_table = {
-    "1.4": "2.9",
     "1.3": "2.8",
     "1.2": "2.7",
     "1.1": "2.6",
fbgemm_gpu/docs/target.genai.json.py CHANGED
@@ -1,6 +1,6 @@
 
 {
-    "version": "2025.10.20",
+    "version": "2025.10.25",
     "target": "genai",
     "variant": "cuda"
 }
fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py CHANGED
@@ -3840,6 +3840,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
     (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
     (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
     (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
+    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
+    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
+    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
+    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
 ]
 
 
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py CHANGED
@@ -61,6 +61,8 @@ def _cutlass_blackwell_fmha_forward(
     softmax_scale: float | None = None,
     causal: bool = False,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_left: int = -1,
     window_right: int = -1,
     bottom_right: bool = True,
@@ -79,6 +81,8 @@ def _cutlass_blackwell_fmha_forward(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
         window_size_left=window_left,
         window_size_right=window_right,
         bottom_right=bottom_right,
@@ -171,6 +175,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         max_seq_len_q: Optional[int] = None,
         max_seq_len_k: Optional[int] = None,
         seqlen_kv: Optional[torch.Tensor] = None,
+        page_table: Optional[torch.Tensor] = None,
+        seqlen_k: Optional[int] = None,
         window_size: tuple[int, int] = (-1, -1),
         bottom_right: bool = True,
         deterministic: bool = False,
@@ -220,6 +226,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             softmax_scale,
             causal,
             seqlen_kv,
+            page_table,
+            seqlen_k,
             window_left,
             window_right,
             bottom_right,
@@ -252,6 +260,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         None,
         None,
         None,
+        None,
+        None,
     ]:
         if ctx.is_gen:
             # For gen case, no backward pass is needed (generation is inference only)
@@ -279,7 +289,23 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             bottom_right=ctx.bottom_right,
             deterministic=ctx.deterministic,
         )
-        return dq, dk, dv, None, None, None, None, None, None, None, None, None, None
+        return (
+            dq,
+            dk,
+            dv,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
 
 
 def cutlass_blackwell_fmha_func(
@@ -293,6 +319,8 @@ def cutlass_blackwell_fmha_func(
     max_seq_len_q: int | None = None,
     max_seq_len_k: int | None = None,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_size: tuple[int, int] | None = (-1, -1),
     bottom_right: bool = True,
     deterministic: bool = False,
@@ -308,6 +336,8 @@ def cutlass_blackwell_fmha_func(
         max_seq_len_q,
         max_seq_len_k,
         seqlen_kv,
+        page_table,
+        seqlen_k,
         window_size,
         bottom_right,
         deterministic,
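
This release threads two new optional arguments, page_table and seqlen_k, from cutlass_blackwell_fmha_func through CutlassBlackwellFmhaFunc.forward down to _cutlass_blackwell_fmha_forward, presumably to support a paged KV cache. A hedged call sketch follows; the q/k/v names, shapes, dtypes, and the page-table layout are illustrative assumptions and are not taken from this diff:

import torch
from fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_interface import (
    cutlass_blackwell_fmha_func,
)

# Assumed decode shapes: batch=2, 1 query token per sequence, 8 heads, head_dim=128.
q = torch.randn(2, 1, 8, 128, device="cuda", dtype=torch.bfloat16)
# Assumed paged KV cache layout: num_pages x page_size x heads x head_dim.
k = torch.randn(64, 64, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn_like(k)
# Assumed page table: one row of physical page indices per sequence.
page_table = torch.randint(0, 64, (2, 32), device="cuda", dtype=torch.int32)
seqlen_kv = torch.tensor([1500, 2048], device="cuda", dtype=torch.int32)

out = cutlass_blackwell_fmha_func(
    q,
    k,
    v,
    causal=True,
    seqlen_kv=seqlen_kv,
    page_table=page_table,  # new in this nightly
    seqlen_k=2048,          # new in this nightly: assumed max KV length covered by the page table
)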
fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py CHANGED
@@ -175,7 +175,7 @@ class QuantizeOpBase(metaclass=abc.ABCMeta):
         if use_cuda_graph:
             with torch.cuda.stream(torch.cuda.Stream()):
                 t = triton.testing.do_bench_cudagraph(
-                    lambda: self.quantize_and_compute(*args, **kwargs)
+                    lambda: self.quantize_and_compute(*args, **kwargs), rep=200
                 )
         else:
             t = triton.testing.do_bench(
@@ -188,7 +188,7 @@ class QuantizeOpBase(metaclass=abc.ABCMeta):
         if use_cuda_graph:
             with torch.cuda.stream(torch.cuda.Stream()):
                 t = triton.testing.do_bench_cudagraph(
-                    lambda: self.compute(*args, **kwargs)
+                    lambda: self.compute(*args, **kwargs), rep=200
                 )
         else:
             t = triton.testing.do_bench(lambda: self.compute(*args, **kwargs))
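
Both CUDA-graph benchmark paths now pass rep=200 to triton.testing.do_bench_cudagraph, which Triton interprets as the measurement time budget in milliseconds, lengthening the timing window beyond the default and reducing run-to-run noise. A standalone sketch of the same pattern with a stand-in workload:

import torch
import triton

a = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)
b = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)

def workload() -> None:
    # Stand-in for quantize_and_compute(...) / compute(...).
    torch.matmul(a, b)

# Benchmark under CUDA graph capture on a non-default stream, as quantize_ops.py does,
# with a 200 ms measurement budget.
with torch.cuda.stream(torch.cuda.Stream()):
    ms = triton.testing.do_bench_cudagraph(workload, rep=200)
print(f"mean latency: {ms:.4f} ms")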
fbgemm_gpu/split_table_batched_embeddings_ops_training.py CHANGED
@@ -1714,10 +1714,129 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )
 
-    @torch.jit.ignore
-    def _report_tbe_mem_usage(
+    def _get_tensor_memory(self, tensor_name: str) -> int:
+        """Get memory usage of a tensor in bytes."""
+        if not hasattr(self, tensor_name):
+            self.log(f"Tensor '{tensor_name}' not found, using 0 bytes")
+            return 0
+        tensor = getattr(self, tensor_name)
+        return tensor.numel() * tensor.element_size()
+
+    def _categorize_memory_by_location(
+        self, tensor_names: list[str]
+    ) -> tuple[int, int]:
+        """Categorize memory into HBM and UVM for given tensors.
+
+        Returns:
+            (hbm_bytes, uvm_bytes)
+        """
+        uvm_set = set(self._uvm_tensors_log)
+        hbm_bytes = 0
+        uvm_bytes = 0
+
+        for name in tensor_names:
+            size = self._get_tensor_memory(name)
+            if name in uvm_set:
+                uvm_bytes += size
+            else:
+                hbm_bytes += size
+
+        return hbm_bytes, uvm_bytes
+
+    def _report_hbm_breakdown(
+        self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
+    ) -> None:
+        """Report HBM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    def _report_uvm_breakdown(
         self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
     ) -> None:
+        """Report UVM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    @torch.jit.ignore
+    def _report_tbe_mem_usage(self) -> None:
         if self.stats_reporter is None:
             return
 
@@ -1725,22 +1844,24 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if not stats_reporter.should_report(self.step):
             return
 
+        # Calculate total memory from all parameters and buffers (always needed)
         total_mem_usage = sum(
-            param.numel() * param.element_size() for param in self.parameters()
-        ) + sum(buffer.numel() * buffer.element_size() for buffer in self.buffers())
+            p.numel() * p.element_size() for p in self.parameters()
+        ) + sum(b.numel() * b.element_size() for b in self.buffers())
+
+        # Calculate total HBM and UVM usage (always needed)
         if self.use_cpu:
             total_hbm_usage = 0
             total_uvm_usage = total_mem_usage
         else:
-            # hbm usage is total usage minus uvm usage
            total_uvm_usage = sum(
-                getattr(self, tensor_name).numel()
-                * getattr(self, tensor_name).element_size()
-                for tensor_name in self._uvm_tensors_log
-                if hasattr(self, tensor_name)
+                self._get_tensor_memory(name)
+                for name in self._uvm_tensors_log
+                if hasattr(self, name)
            )
            total_hbm_usage = total_mem_usage - total_uvm_usage
 
+        # Report total memory usage metrics (always reported for backward compatibility)
         stats_reporter.report_data_amount(
             iteration_step=self.step,
             event_name="tbe.total_hbm_usage",
@@ -1756,6 +1877,76 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )
 
+        # Check if detailed memory breakdown is enabled via environment variable
+        # Set FBGEMM_TBE_MEM_BREAKDOWN=1 to enable expensive detailed breakdown
+        enable_detailed_breakdown = (
+            int(os.environ.get("FBGEMM_TBE_MEM_BREAKDOWN", "0")) == 1
+        )
+
+        if not enable_detailed_breakdown:
+            return
+
+        # Tensor groups for sparse memory categorization
+        weight_tensors = ["weights_dev", "weights_host", "weights_uvm"]
+        optimizer_tensors = [
+            "momentum1_dev",
+            "momentum1_host",
+            "momentum1_uvm",
+            "momentum2_dev",
+            "momentum2_host",
+            "momentum2_uvm",
+        ]
+        cache_tensors = [
+            "lxu_cache_weights",
+            "lxu_cache_state",
+            "lxu_state",
+            "cache_hash_size_cumsum",
+            "cache_index_table_map",
+            "cache_miss_counter",
+            "lxu_cache_locking_counter",
+        ]
+
+        # Calculate total memory for each component
+        weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
+        optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
+        cache_total = sum(self._get_tensor_memory(t) for t in cache_tensors)
+
+        # Categorize memory by location (HBM vs UVM)
+        if self.use_cpu:
+            weights_hbm, weights_uvm = 0, weights_total
+            opt_hbm, opt_uvm = 0, optimizer_total
+            cache_hbm, cache_uvm = 0, cache_total
+        else:
+            weights_hbm, weights_uvm = self._categorize_memory_by_location(
+                weight_tensors
+            )
+            opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
+            cache_hbm, cache_uvm = self._categorize_memory_by_location(cache_tensors)
+
+        # Calculate ephemeral memory split between HBM and UVM
+        static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
+        static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
+        ephemeral_hbm = total_hbm_usage - static_sparse_hbm
+        ephemeral_uvm = total_uvm_usage - static_sparse_uvm
+
+        # Report granular memory breakdowns
+        self._report_hbm_breakdown(
+            stats_reporter,
+            weights_hbm,
+            opt_hbm,
+            cache_hbm,
+            static_sparse_hbm,
+            ephemeral_hbm,
+        )
+        self._report_uvm_breakdown(
+            stats_reporter,
+            weights_uvm,
+            opt_uvm,
+            cache_uvm,
+            static_sparse_uvm,
+            ephemeral_uvm,
+        )
+
     @torch.jit.ignore
     def _report_io_size_count(self, event: str, data: Tensor) -> Tensor:
         if self.stats_reporter is None:
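
The detailed per-component breakdown (the tbe.hbm.* and tbe.uvm.* events) is gated behind the FBGEMM_TBE_MEM_BREAKDOWN environment variable and is skipped by default; only the existing total-usage metrics are reported. A minimal opt-in sketch:

import os

# Opt in to the detailed HBM/UVM breakdown before the TBE module reports stats.
os.environ["FBGEMM_TBE_MEM_BREAKDOWN"] = "1"

# ... construct SplitTableBatchedEmbeddingBagsCodegen with a stats_reporter as usual.
# When stats_reporter.should_report(step) is true, the extra tbe.hbm.* / tbe.uvm.*
# events (embeddings, optimizer_states, cache, total_static_sparse, ephemeral)
# are then reported alongside the totals.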
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py CHANGED
@@ -76,6 +76,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
        reverse_qparam: bool = False, # True to load qparams at end of each row; False to load qparam at begnning of each row.
        feature_names_per_table: Optional[list[list[str]]] = None,
        indices_dtype: torch.dtype = torch.int32, # Used for construction of the remap_indices tensors. Should match the dtype of the indices passed in the forward() call (INT32 or INT64).
+        embedding_cache_mode: bool = False, # True for zero initialization, False for randomized initialization
    ) -> None: # noqa C901 # tuple of (rows, dims,)
        super(KVEmbeddingInference, self).__init__(
            embedding_specs=embedding_specs,
@@ -114,9 +115,13 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
        num_shards = 32
        uniform_init_lower: float = -0.01
        uniform_init_upper: float = 0.01
+
        # pyre-fixme[4]: Attribute must be annotated.
        self.kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
-            num_shards, uniform_init_lower, uniform_init_upper
+            num_shards,
+            uniform_init_lower,
+            uniform_init_upper,
+            embedding_cache_mode, # in embedding_cache_mode, we disable random init
        )
 
        self.specs: list[tuple[int, int, int]] = [
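
KVEmbeddingInference now exposes embedding_cache_mode and forwards it to DramKVEmbeddingInferenceWrapper so that weights are zero-initialized instead of randomized. A hedged construction sketch; the embedding_specs layout below follows the IntNBit inference TBE convention (table name, rows, dim, weight type, location) and is an assumption here, as is having the fbgemm_gpu native ops loaded:

from fbgemm_gpu.split_embedding_configs import SparseType
from fbgemm_gpu.split_table_batched_embeddings_ops_common import EmbeddingLocation
from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference

# Assumed spec layout: (table name, num embeddings, embedding dim, weight dtype, location).
tbe = KVEmbeddingInference(
    embedding_specs=[("t0", 100_000, 128, SparseType.INT8, EmbeddingLocation.HOST)],
    embedding_cache_mode=True,  # new flag: zero-initialize instead of randomized init
)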
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_genai_nightly
-Version: 2025.10.20
+Version: 2025.10.25
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
@@ -1,4 +1,4 @@
-fbgemm_gpu/__init__.py,sha256=BxqlqUln-q_ljJpo_Cg3R2RYZqxCbZ0UjvdBe6DzNZk,6301
+fbgemm_gpu/__init__.py,sha256=A3DuseilQ-sEtBpeZsG0LOqN5Cl3e5DHI_YgCZEMhnE,6283
 fbgemm_gpu/asmjit.so,sha256=tp-5cN7HUYo7cjvR_kl_vfPBSEv78-IQxdvHN-nXFAM,501728
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
@@ -17,7 +17,7 @@ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43m
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
 fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=76ME0692CC691xpjiOsY3Xxy-LD_XKs8w9vq1gcm9tM,16440
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=lF9eP6GDTyqbEJgl-SO6gNYUk2dv2YE2bMEtzGkY21c,173757
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=f0sXfvkE0Wx0Rd3qTT4XmCbBK0wYgWGzhPncZEv-p48,180420
 fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=e3O9ElaWBGvG7TdT3Ok_8cB06jhskXuyCQ0t40dzsEY,5449
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
@@ -32,27 +32,27 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.genai.json.py,sha256=EffeoYnTPp4BLew_sFOpBinWQgXup1DReXuDroDTnh8,79
+fbgemm_gpu/docs/target.genai.json.py,sha256=zheBID2LxrSDF8HifsFuVZUqVl4YgiUCdj1Xr8ty-O8,79
 fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=fOyUuW3hkDvgT6wxaUvCzZtj5G6pWOfQKLnjIJ5FUAg,407744
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=25F2p_vZw0F_CgRVKMEAsbgOiaF28jJ-hc9fX5jibEY,243904
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
 fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=2RjIDSzUXtoFoC2ryp-C-j5H83mbSjPwvsvTrThfrqE,215658
-fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
+fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=q1o0FfGcUAQjkxKlJjjqKVSaPd3HaBSs6L9qVHY7qKI,152924
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=xsfHP5BNQ6IqiCxVYYEvWfF2wTD3vSt9lYciiqm_5Nk,287360856
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=Hv72XrRgHXgVSTpj-tTHE-AvGkywHYYeoUXdYwY0_lQ,74888976
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=ntFgFs0foi6NQx8eqs5I3fCjzKSI0spXfEWiMhlcT00,897
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
-fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=7ydkrZ6qqyiah1dlJX6EuEXXw6WwOqCj7D48PWNJcUw,9259
+fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=K9cPXGOF4E9VHzuVtJjDPoTC7JjhEqS1RmmWSehQrKU,9887
 fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=XpAK_eyqDSKeFC5J9KpnKtbZG07mrDh9d2j1LFKzr-8,404
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
 fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py,sha256=BWl6t-4acbuRSEX2aVNDlFrSWZkqMWK2sI3VONaMd3Q,24047
-fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=H6AchejyZs76_snM_ae5vV0cPr_Q0h35OQ8qED0r1N4,104915
+fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=cDZS2rCb1W2IEQYxsnGjauhlUhg2PFZ-9LqJ_SEdbiQ,104933
 fbgemm_gpu/experimental/gen_ai/moe/README.md,sha256=z9ybHmv4KFJ1drj5OByuFaOY0tRQwwiIW3Q22TB_2-k,904
 fbgemm_gpu/experimental/gen_ai/moe/__init__.py,sha256=lwSvff07yEav024B1XyfgW8r8hwNe--aEDywcO7rnbM,1905
 fbgemm_gpu/experimental/gen_ai/moe/activation.py,sha256=NiXhWyCNagI3P9N3N89iSX7xKuShdkq9DxEUAzoV6y0,7892
@@ -94,7 +94,7 @@ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=MNddYzoRlu0mNhnsVVG57JN7pB
 fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=sptdqcNE9JlgyIJ17neZaMxagKG469_ynX0mVx_JKBY,6090
 fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
 fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
-fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=m8rCF8bc_5vBrg9677TDZTQXqRdFt6YPUVVKv85up5s,14380
+fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
 fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/METADATA,sha256=YSlW54hUiRgcqmN2NJaqU8mF-KLXXKCF_MpwTI2USC0,2656
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/RECORD,,
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/METADATA,sha256=nAiko7_2Se0u8j18sS-uwInAjpc9TEsVEq0Jn_YNmi4,2656
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/RECORD,,