fbgemm-gpu-genai-nightly 2025.10.20__cp311-cp311-manylinux_2_28_x86_64.whl → 2025.10.26__cp311-cp311-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +0 -1
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/docs/target.genai.json.py +1 -1
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4 -0
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +31 -1
- fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +2 -2
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +200 -9
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +6 -1
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2025.10.26.dist-info}/METADATA +1 -1
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2025.10.26.dist-info}/RECORD +15 -15
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2025.10.26.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2025.10.26.dist-info}/top_level.txt +0 -0
fbgemm_gpu/__init__.py
CHANGED

fbgemm_gpu/asmjit.so
CHANGED
Binary file

fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so
CHANGED
Binary file
fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
CHANGED

@@ -3840,6 +3840,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
     (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
     (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
     (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
+    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
+    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
+    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
+    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
 ]
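The four added tuples widen the pingpong autotuning search space for the 4K/8K/16K matmul shapes. As a hedged illustration of how tuned tuples like these are commonly expanded into Triton autotuner configs; the field layout ("BLOCK_M", "BLOCK_N", "BLOCK_K") and the fixed num_warps/num_stages below are assumptions for illustration, not fp8_gemm.py's actual mapping:

import triton

# Hypothetical field layout: the first three entries are treated as block
# sizes; the trailing fields (stages, warps, etc.) vary by kernel and are
# not decoded here.
_NEW_TUPLES = [
    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
]

def configs_from_tuples(tuples):
    return [
        triton.Config(
            {"BLOCK_M": m, "BLOCK_N": n, "BLOCK_K": k},
            num_warps=8,   # assumed; in practice likely encoded in the trailing fields
            num_stages=3,  # assumed
        )
        for (m, n, k, *_rest) in tuples
    ]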
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py
CHANGED

@@ -61,6 +61,8 @@ def _cutlass_blackwell_fmha_forward(
     softmax_scale: float | None = None,
     causal: bool = False,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_left: int = -1,
     window_right: int = -1,
     bottom_right: bool = True,

@@ -79,6 +81,8 @@ def _cutlass_blackwell_fmha_forward(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
         window_size_left=window_left,
         window_size_right=window_right,
         bottom_right=bottom_right,

@@ -171,6 +175,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         max_seq_len_q: Optional[int] = None,
         max_seq_len_k: Optional[int] = None,
         seqlen_kv: Optional[torch.Tensor] = None,
+        page_table: Optional[torch.Tensor] = None,
+        seqlen_k: Optional[int] = None,
         window_size: tuple[int, int] = (-1, -1),
         bottom_right: bool = True,
         deterministic: bool = False,

@@ -220,6 +226,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             softmax_scale,
             causal,
             seqlen_kv,
+            page_table,
+            seqlen_k,
             window_left,
             window_right,
             bottom_right,

@@ -252,6 +260,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         None,
         None,
         None,
+        None,
+        None,
     ]:
         if ctx.is_gen:
             # For gen case, no backward pass is needed (generation is inference only)

@@ -279,7 +289,23 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             bottom_right=ctx.bottom_right,
             deterministic=ctx.deterministic,
         )
-        return
+        return (
+            dq,
+            dk,
+            dv,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )

@@ -293,6 +319,8 @@ def cutlass_blackwell_fmha_func(
     max_seq_len_q: int | None = None,
     max_seq_len_k: int | None = None,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_size: tuple[int, int] | None = (-1, -1),
     bottom_right: bool = True,
     deterministic: bool = False,

@@ -308,6 +336,8 @@ def cutlass_blackwell_fmha_func(
         max_seq_len_q,
         max_seq_len_k,
         seqlen_kv,
+        page_table,
+        seqlen_k,
         window_size,
         bottom_right,
         deterministic,
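The new page_table and seqlen_k arguments are threaded through every layer of the interface, from the public cutlass_blackwell_fmha_func down to _cutlass_blackwell_fmha_forward, which points to paged-KV attention support. A hedged usage sketch: only the module path and the keyword names come from this diff; the positional q/k/v arguments, tensor shapes, and page-table layout are assumptions based on common paged-attention conventions.

import torch
from fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_interface import (
    cutlass_blackwell_fmha_func,
)

# Assumed shapes: q is (batch, q_len, heads, head_dim); k/v are a paged KV
# pool of (num_pages, page_size, heads, head_dim); page_table maps each
# batch element's logical pages to physical pages.
q = torch.randn(2, 1, 8, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn(64, 256, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn_like(k)
page_table = torch.zeros(2, 8, device="cuda", dtype=torch.int32)

out = cutlass_blackwell_fmha_func(
    q, k, v,                 # leading positional q/k/v assumed
    causal=True,
    page_table=page_table,   # new in 2025.10.26
    seqlen_k=2048,           # new in 2025.10.26
)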
fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
CHANGED

@@ -175,7 +175,7 @@ class QuantizeOpBase(metaclass=abc.ABCMeta):
         if use_cuda_graph:
             with torch.cuda.stream(torch.cuda.Stream()):
                 t = triton.testing.do_bench_cudagraph(
-                    lambda: self.quantize_and_compute(*args, **kwargs)
+                    lambda: self.quantize_and_compute(*args, **kwargs), rep=200
                 )
         else:
             t = triton.testing.do_bench(

@@ -188,7 +188,7 @@ class QuantizeOpBase(metaclass=abc.ABCMeta):
         if use_cuda_graph:
             with torch.cuda.stream(torch.cuda.Stream()):
                 t = triton.testing.do_bench_cudagraph(
-                    lambda: self.compute(*args, **kwargs)
+                    lambda: self.compute(*args, **kwargs), rep=200
                 )
         else:
             t = triton.testing.do_bench(lambda: self.compute(*args, **kwargs))
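Both CUDA-graph benchmark paths now pass rep=200 to triton.testing.do_bench_cudagraph; in Triton, rep is the measurement time budget in milliseconds, so this extends each timing run for more stable numbers. A minimal sketch of the same pattern:

import torch
import triton
import triton.testing

def bench_ms(fn, use_cuda_graph: bool = True) -> float:
    # Mirrors the benchmark path above: time the op under CUDA-graph capture
    # on a side stream, with the 200 ms repetition budget used in this release.
    if use_cuda_graph:
        with torch.cuda.stream(torch.cuda.Stream()):
            return triton.testing.do_bench_cudagraph(fn, rep=200)
    return triton.testing.do_bench(fn)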
fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so
CHANGED
Binary file
fbgemm_gpu/fbgemm.so
CHANGED
Binary file
fbgemm_gpu/split_table_batched_embeddings_ops_training.py
CHANGED

@@ -1714,10 +1714,129 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )

-    @torch.jit.ignore
-    def _report_tbe_mem_usage(
+    def _get_tensor_memory(self, tensor_name: str) -> int:
+        """Get memory usage of a tensor in bytes."""
+        if not hasattr(self, tensor_name):
+            self.log(f"Tensor '{tensor_name}' not found, using 0 bytes")
+            return 0
+        tensor = getattr(self, tensor_name)
+        return tensor.numel() * tensor.element_size()
+
+    def _categorize_memory_by_location(
+        self, tensor_names: list[str]
+    ) -> tuple[int, int]:
+        """Categorize memory into HBM and UVM for given tensors.
+
+        Returns:
+            (hbm_bytes, uvm_bytes)
+        """
+        uvm_set = set(self._uvm_tensors_log)
+        hbm_bytes = 0
+        uvm_bytes = 0
+
+        for name in tensor_names:
+            size = self._get_tensor_memory(name)
+            if name in uvm_set:
+                uvm_bytes += size
+            else:
+                hbm_bytes += size
+
+        return hbm_bytes, uvm_bytes
+
+    def _report_hbm_breakdown(
+        self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
+    ) -> None:
+        """Report HBM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    def _report_uvm_breakdown(
         self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
     ) -> None:
+        """Report UVM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    @torch.jit.ignore
+    def _report_tbe_mem_usage(self) -> None:
         if self.stats_reporter is None:
             return
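The new helpers boil down to two rules: a tensor's footprint is numel() * element_size(), and a named tensor counts toward UVM if its name appears in _uvm_tensors_log, otherwise toward HBM. A standalone sketch of that accounting (the function and variable names here are illustrative, not part of the module):

import torch

def tensor_bytes(t: torch.Tensor) -> int:
    # Same accounting as _get_tensor_memory: element count times element size.
    return t.numel() * t.element_size()

def split_hbm_uvm(
    named_tensors: dict[str, torch.Tensor], uvm_names: set[str]
) -> tuple[int, int]:
    # Same split as _categorize_memory_by_location: membership in the UVM
    # name log decides where each tensor's bytes are attributed.
    hbm = sum(tensor_bytes(t) for n, t in named_tensors.items() if n not in uvm_names)
    uvm = sum(tensor_bytes(t) for n, t in named_tensors.items() if n in uvm_names)
    return hbm, uvm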
@@ -1725,22 +1844,24 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if not stats_reporter.should_report(self.step):
             return

+        # Calculate total memory from all parameters and buffers (always needed)
         total_mem_usage = sum(
-
-        ) + sum(
+            p.numel() * p.element_size() for p in self.parameters()
+        ) + sum(b.numel() * b.element_size() for b in self.buffers())
+
+        # Calculate total HBM and UVM usage (always needed)
         if self.use_cpu:
             total_hbm_usage = 0
             total_uvm_usage = total_mem_usage
         else:
-            # hbm usage is total usage minus uvm usage
             total_uvm_usage = sum(
-
-
-
-                if hasattr(self, tensor_name)
+                self._get_tensor_memory(name)
+                for name in self._uvm_tensors_log
+                if hasattr(self, name)
             )
             total_hbm_usage = total_mem_usage - total_uvm_usage

+        # Report total memory usage metrics (always reported for backward compatibility)
         stats_reporter.report_data_amount(
             iteration_step=self.step,
             event_name="tbe.total_hbm_usage",
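The rewritten total is simply the byte count of every parameter plus every buffer of the module; the same computation works on any nn.Module, as in this sketch:

import torch
from torch import nn

def module_bytes(m: nn.Module) -> int:
    # Bytes of all parameters plus bytes of all buffers, mirroring the
    # total_mem_usage computation above.
    return sum(p.numel() * p.element_size() for p in m.parameters()) + sum(
        b.numel() * b.element_size() for b in m.buffers()
    )

print(module_bytes(nn.Linear(1024, 1024)))  # ~4 MiB for fp32 weight + bias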
@@ -1756,6 +1877,76 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )

+        # Check if detailed memory breakdown is enabled via environment variable
+        # Set FBGEMM_TBE_MEM_BREAKDOWN=1 to enable expensive detailed breakdown
+        enable_detailed_breakdown = (
+            int(os.environ.get("FBGEMM_TBE_MEM_BREAKDOWN", "0")) == 1
+        )
+
+        if not enable_detailed_breakdown:
+            return
+
+        # Tensor groups for sparse memory categorization
+        weight_tensors = ["weights_dev", "weights_host", "weights_uvm"]
+        optimizer_tensors = [
+            "momentum1_dev",
+            "momentum1_host",
+            "momentum1_uvm",
+            "momentum2_dev",
+            "momentum2_host",
+            "momentum2_uvm",
+        ]
+        cache_tensors = [
+            "lxu_cache_weights",
+            "lxu_cache_state",
+            "lxu_state",
+            "cache_hash_size_cumsum",
+            "cache_index_table_map",
+            "cache_miss_counter",
+            "lxu_cache_locking_counter",
+        ]
+
+        # Calculate total memory for each component
+        weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
+        optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
+        cache_total = sum(self._get_tensor_memory(t) for t in cache_tensors)
+
+        # Categorize memory by location (HBM vs UVM)
+        if self.use_cpu:
+            weights_hbm, weights_uvm = 0, weights_total
+            opt_hbm, opt_uvm = 0, optimizer_total
+            cache_hbm, cache_uvm = 0, cache_total
+        else:
+            weights_hbm, weights_uvm = self._categorize_memory_by_location(
+                weight_tensors
+            )
+            opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
+            cache_hbm, cache_uvm = self._categorize_memory_by_location(cache_tensors)
+
+        # Calculate ephemeral memory split between HBM and UVM
+        static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
+        static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
+        ephemeral_hbm = total_hbm_usage - static_sparse_hbm
+        ephemeral_uvm = total_uvm_usage - static_sparse_uvm
+
+        # Report granular memory breakdowns
+        self._report_hbm_breakdown(
+            stats_reporter,
+            weights_hbm,
+            opt_hbm,
+            cache_hbm,
+            static_sparse_hbm,
+            ephemeral_hbm,
+        )
+        self._report_uvm_breakdown(
+            stats_reporter,
+            weights_uvm,
+            opt_uvm,
+            cache_uvm,
+            static_sparse_uvm,
+            ephemeral_uvm,
+        )
+
     @torch.jit.ignore
     def _report_io_size_count(self, event: str, data: Tensor) -> Tensor:
         if self.stats_reporter is None:
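The detailed breakdown is gated off by default because it is more expensive; it only runs when the FBGEMM_TBE_MEM_BREAKDOWN environment variable is set to 1, for example:

import os

# Opt in to the detailed per-component breakdown before the reporting step
# runs; otherwise only the pre-existing total-usage metrics (e.g.
# tbe.total_hbm_usage) are emitted.
os.environ["FBGEMM_TBE_MEM_BREAKDOWN"] = "1"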
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py
CHANGED

@@ -76,6 +76,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         reverse_qparam: bool = False,  # True to load qparams at end of each row; False to load qparam at begnning of each row.
         feature_names_per_table: Optional[list[list[str]]] = None,
         indices_dtype: torch.dtype = torch.int32,  # Used for construction of the remap_indices tensors. Should match the dtype of the indices passed in the forward() call (INT32 or INT64).
+        embedding_cache_mode: bool = False,  # True for zero initialization, False for randomized initialization
     ) -> None:  # noqa C901 # tuple of (rows, dims,)
         super(KVEmbeddingInference, self).__init__(
             embedding_specs=embedding_specs,

@@ -114,9 +115,13 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         num_shards = 32
         uniform_init_lower: float = -0.01
         uniform_init_upper: float = 0.01
+
         # pyre-fixme[4]: Attribute must be annotated.
         self.kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
-            num_shards,
+            num_shards,
+            uniform_init_lower,
+            uniform_init_upper,
+            embedding_cache_mode,  # in embedding_cache_mode, we disable random init
         )

         self.specs: list[tuple[int, int, int]] = [
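embedding_cache_mode is now forwarded into DramKVEmbeddingInferenceWrapper together with the init bounds, so callers can opt into zero-initialized rows. A hedged construction sketch: the class path and the new flag come from this diff, while embedding_specs is a placeholder in whatever format the parent IntNBitTableBatchedEmbeddingBagsCodegen expects.

from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference

# `embedding_specs` is left abstract here: build it as for the parent class.
# The new flag switches row init from randomized (uniform in [-0.01, 0.01],
# per the bounds above) to zero initialization.
kv_tbe = KVEmbeddingInference(
    embedding_specs=embedding_specs,  # placeholder: pre-built table specs
    embedding_cache_mode=True,        # new in 2025.10.26: zero-init rows
)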
{fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2025.10.26.dist-info}/RECORD
CHANGED

@@ -1,8 +1,8 @@
-fbgemm_gpu/__init__.py,sha256=
-fbgemm_gpu/asmjit.so,sha256=
+fbgemm_gpu/__init__.py,sha256=A3DuseilQ-sEtBpeZsG0LOqN5Cl3e5DHI_YgCZEMhnE,6283
+fbgemm_gpu/asmjit.so,sha256=RxTYI8zY4PpIBRpSKT_-U7bRIVeTRohdtRFUmLNU1tQ,501728
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=
+fbgemm_gpu/fbgemm.so,sha256=B9y6MDLC6Ou7Bw_pT07Xfw5g5Q4j2yn9Xsp96QVpgEU,5646712
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746

@@ -17,7 +17,7 @@ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43m
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
 fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=76ME0692CC691xpjiOsY3Xxy-LD_XKs8w9vq1gcm9tM,16440
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=f0sXfvkE0Wx0Rd3qTT4XmCbBK0wYgWGzhPncZEv-p48,180420
 fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=e3O9ElaWBGvG7TdT3Ok_8cB06jhskXuyCQ0t40dzsEY,5449
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063

@@ -32,27 +32,27 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.genai.json.py,sha256=
+fbgemm_gpu/docs/target.genai.json.py,sha256=ajhntGyjTTpz2gTBZ0AIGOreSmrqXYTCsLcujrtNSHk,79
 fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=CFzw20PsjS83INRnHwrOYcWybr1SaUXcA3jod77r-LM,243904
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
 fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=2RjIDSzUXtoFoC2ryp-C-j5H83mbSjPwvsvTrThfrqE,215658
-fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=
+fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=q1o0FfGcUAQjkxKlJjjqKVSaPd3HaBSs6L9qVHY7qKI,152924
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=z38Aw4HDVn4BRpK6a9g1b0PdbClb_10LiqWzn22C0hM,74888976
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=ntFgFs0foi6NQx8eqs5I3fCjzKSI0spXfEWiMhlcT00,897
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
-fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=
+fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=K9cPXGOF4E9VHzuVtJjDPoTC7JjhEqS1RmmWSehQrKU,9887
 fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=XpAK_eyqDSKeFC5J9KpnKtbZG07mrDh9d2j1LFKzr-8,404
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
 fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py,sha256=BWl6t-4acbuRSEX2aVNDlFrSWZkqMWK2sI3VONaMd3Q,24047
-fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=
+fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=cDZS2rCb1W2IEQYxsnGjauhlUhg2PFZ-9LqJ_SEdbiQ,104933
 fbgemm_gpu/experimental/gen_ai/moe/README.md,sha256=z9ybHmv4KFJ1drj5OByuFaOY0tRQwwiIW3Q22TB_2-k,904
 fbgemm_gpu/experimental/gen_ai/moe/__init__.py,sha256=lwSvff07yEav024B1XyfgW8r8hwNe--aEDywcO7rnbM,1905
 fbgemm_gpu/experimental/gen_ai/moe/activation.py,sha256=NiXhWyCNagI3P9N3N89iSX7xKuShdkq9DxEUAzoV6y0,7892

@@ -94,7 +94,7 @@ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=MNddYzoRlu0mNhnsVVG57JN7pB
 fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=sptdqcNE9JlgyIJ17neZaMxagKG469_ynX0mVx_JKBY,6090
 fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
 fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
-fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=
+fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
 fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044

@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
+fbgemm_gpu_genai_nightly-2025.10.26.dist-info/METADATA,sha256=26cyhltFwGI-kyhkYZONi8FZb30P1Y99MRu-nh9dD8g,2656
+fbgemm_gpu_genai_nightly-2025.10.26.dist-info/WHEEL,sha256=V2Q6mQKbouIadCxoRjt9FQ9oKfi45-uZUcoc77zzs0M,108
+fbgemm_gpu_genai_nightly-2025.10.26.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.26.dist-info/RECORD,,
{fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2025.10.26.dist-info}/WHEEL
File without changes

{fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2025.10.26.dist-info}/top_level.txt
File without changes