fbgemm-gpu-genai-nightly 2025.10.19-cp310-cp310-manylinux_2_28_x86_64.whl → 2025.10.25-cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/docs/target.genai.json.py +1 -1
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4 -0
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +31 -1
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +186 -0
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +6 -1
- {fbgemm_gpu_genai_nightly-2025.10.19.dist-info → fbgemm_gpu_genai_nightly-2025.10.25.dist-info}/METADATA +1 -1
- {fbgemm_gpu_genai_nightly-2025.10.19.dist-info → fbgemm_gpu_genai_nightly-2025.10.25.dist-info}/RECORD +10 -10
- {fbgemm_gpu_genai_nightly-2025.10.19.dist-info → fbgemm_gpu_genai_nightly-2025.10.25.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.19.dist-info → fbgemm_gpu_genai_nightly-2025.10.25.dist-info}/top_level.txt +0 -0

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
CHANGED

@@ -3840,6 +3840,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
     (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
     (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
     (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
+    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
+    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
+    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
+    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
 ]

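The four new tuples extend the pingpong matmul tuning table for large (4K/8K/16K) shapes. As a rough, hedged illustration only (the real field layout and kernel live in fp8_gemm.py and are not reproduced here), config tuples of this kind are typically expanded into triton.Config objects before autotuning; the field names and the num_warps/num_stages values below are hypothetical.

import triton

# Hypothetical field layout for illustration; fp8_gemm.py defines the real one.
_EXAMPLE_TUPLES = [
    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
]

def tuples_to_triton_configs(tuples):
    """Expand (BLOCK_M, BLOCK_N, BLOCK_K, ...) tuples into triton.Config objects."""
    configs = []
    for block_m, block_n, block_k, *_rest in tuples:
        # Only the leading tile sizes are interpreted here; the remaining fields
        # (split-k, stages, warps, ...) follow the kernel's own convention.
        configs.append(
            triton.Config(
                {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k},
                num_warps=8,
                num_stages=3,
            )
        )
    return configs
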
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py
CHANGED

@@ -61,6 +61,8 @@ def _cutlass_blackwell_fmha_forward(
     softmax_scale: float | None = None,
     causal: bool = False,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_left: int = -1,
     window_right: int = -1,
     bottom_right: bool = True,
@@ -79,6 +81,8 @@ def _cutlass_blackwell_fmha_forward(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
         window_size_left=window_left,
         window_size_right=window_right,
         bottom_right=bottom_right,
@@ -171,6 +175,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         max_seq_len_q: Optional[int] = None,
         max_seq_len_k: Optional[int] = None,
         seqlen_kv: Optional[torch.Tensor] = None,
+        page_table: Optional[torch.Tensor] = None,
+        seqlen_k: Optional[int] = None,
         window_size: tuple[int, int] = (-1, -1),
         bottom_right: bool = True,
         deterministic: bool = False,
@@ -220,6 +226,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             softmax_scale,
             causal,
             seqlen_kv,
+            page_table,
+            seqlen_k,
             window_left,
             window_right,
             bottom_right,
@@ -252,6 +260,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         None,
         None,
         None,
+        None,
+        None,
     ]:
         if ctx.is_gen:
             # For gen case, no backward pass is needed (generation is inference only)
@@ -279,7 +289,23 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             bottom_right=ctx.bottom_right,
             deterministic=ctx.deterministic,
         )
-        return
+        return (
+            dq,
+            dk,
+            dv,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )


 def cutlass_blackwell_fmha_func(
@@ -293,6 +319,8 @@ def cutlass_blackwell_fmha_func(
     max_seq_len_q: int | None = None,
     max_seq_len_k: int | None = None,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_size: tuple[int, int] | None = (-1, -1),
     bottom_right: bool = True,
     deterministic: bool = False,
@@ -308,6 +336,8 @@ def cutlass_blackwell_fmha_func(
         max_seq_len_q,
         max_seq_len_k,
         seqlen_kv,
+        page_table,
+        seqlen_k,
         window_size,
         bottom_right,
         deterministic,

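The new page_table and seqlen_k arguments are threaded from the public cutlass_blackwell_fmha_func entry point through CutlassBlackwellFmhaFunc down to the forward kernel call, and the backward now explicitly returns one gradient slot per forward input. A minimal call sketch for a paged-KV decode step follows; the leading q/k/v positional arguments, the tensor shapes, dtypes, and page-table convention are assumptions for illustration, not taken from the kernel.

import torch
from fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_interface import (
    cutlass_blackwell_fmha_func,
)

B, S_Q, H, D = 2, 1, 8, 128         # decode-style single-token query (assumed layout)
NUM_PAGES, PAGE_SIZE = 64, 128      # assumed paged-KV layout
q = torch.randn(B, S_Q, H, D, dtype=torch.bfloat16, device="cuda")
k_cache = torch.randn(NUM_PAGES, PAGE_SIZE, H, D, dtype=torch.bfloat16, device="cuda")
v_cache = torch.randn_like(k_cache)

# Per-batch mapping from logical KV blocks to physical pages (assumed int32).
page_table = torch.arange(B * 32, dtype=torch.int32, device="cuda").reshape(B, 32)
seqlen_kv = torch.full((B,), 4096, dtype=torch.int32, device="cuda")

out = cutlass_blackwell_fmha_func(
    q,
    k_cache,
    v_cache,
    causal=True,
    seqlen_kv=seqlen_kv,     # per-batch valid KV lengths
    page_table=page_table,   # new in this nightly
    seqlen_k=4096,           # new in this nightly: logical KV length
)
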
fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so
CHANGED

Binary file

fbgemm_gpu/split_table_batched_embeddings_ops_training.py
CHANGED

@@ -1722,6 +1722,119 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         tensor = getattr(self, tensor_name)
         return tensor.numel() * tensor.element_size()

+    def _categorize_memory_by_location(
+        self, tensor_names: list[str]
+    ) -> tuple[int, int]:
+        """Categorize memory into HBM and UVM for given tensors.
+
+        Returns:
+            (hbm_bytes, uvm_bytes)
+        """
+        uvm_set = set(self._uvm_tensors_log)
+        hbm_bytes = 0
+        uvm_bytes = 0
+
+        for name in tensor_names:
+            size = self._get_tensor_memory(name)
+            if name in uvm_set:
+                uvm_bytes += size
+            else:
+                hbm_bytes += size
+
+        return hbm_bytes, uvm_bytes
+
+    def _report_hbm_breakdown(
+        self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
+    ) -> None:
+        """Report HBM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    def _report_uvm_breakdown(
+        self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
+    ) -> None:
+        """Report UVM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
     @torch.jit.ignore
     def _report_tbe_mem_usage(self) -> None:
         if self.stats_reporter is None:
@@ -1731,10 +1844,12 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if not stats_reporter.should_report(self.step):
             return

+        # Calculate total memory from all parameters and buffers (always needed)
         total_mem_usage = sum(
             p.numel() * p.element_size() for p in self.parameters()
         ) + sum(b.numel() * b.element_size() for b in self.buffers())

+        # Calculate total HBM and UVM usage (always needed)
         if self.use_cpu:
             total_hbm_usage = 0
             total_uvm_usage = total_mem_usage
@@ -1746,6 +1861,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         )
         total_hbm_usage = total_mem_usage - total_uvm_usage

+        # Report total memory usage metrics (always reported for backward compatibility)
         stats_reporter.report_data_amount(
             iteration_step=self.step,
             event_name="tbe.total_hbm_usage",
@@ -1761,6 +1877,76 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )

+        # Check if detailed memory breakdown is enabled via environment variable
+        # Set FBGEMM_TBE_MEM_BREAKDOWN=1 to enable expensive detailed breakdown
+        enable_detailed_breakdown = (
+            int(os.environ.get("FBGEMM_TBE_MEM_BREAKDOWN", "0")) == 1
+        )
+
+        if not enable_detailed_breakdown:
+            return
+
+        # Tensor groups for sparse memory categorization
+        weight_tensors = ["weights_dev", "weights_host", "weights_uvm"]
+        optimizer_tensors = [
+            "momentum1_dev",
+            "momentum1_host",
+            "momentum1_uvm",
+            "momentum2_dev",
+            "momentum2_host",
+            "momentum2_uvm",
+        ]
+        cache_tensors = [
+            "lxu_cache_weights",
+            "lxu_cache_state",
+            "lxu_state",
+            "cache_hash_size_cumsum",
+            "cache_index_table_map",
+            "cache_miss_counter",
+            "lxu_cache_locking_counter",
+        ]
+
+        # Calculate total memory for each component
+        weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
+        optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
+        cache_total = sum(self._get_tensor_memory(t) for t in cache_tensors)
+
+        # Categorize memory by location (HBM vs UVM)
+        if self.use_cpu:
+            weights_hbm, weights_uvm = 0, weights_total
+            opt_hbm, opt_uvm = 0, optimizer_total
+            cache_hbm, cache_uvm = 0, cache_total
+        else:
+            weights_hbm, weights_uvm = self._categorize_memory_by_location(
+                weight_tensors
+            )
+            opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
+            cache_hbm, cache_uvm = self._categorize_memory_by_location(cache_tensors)
+
+        # Calculate ephemeral memory split between HBM and UVM
+        static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
+        static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
+        ephemeral_hbm = total_hbm_usage - static_sparse_hbm
+        ephemeral_uvm = total_uvm_usage - static_sparse_uvm
+
+        # Report granular memory breakdowns
+        self._report_hbm_breakdown(
+            stats_reporter,
+            weights_hbm,
+            opt_hbm,
+            cache_hbm,
+            static_sparse_hbm,
+            ephemeral_hbm,
+        )
+        self._report_uvm_breakdown(
+            stats_reporter,
+            weights_uvm,
+            opt_uvm,
+            cache_uvm,
+            static_sparse_uvm,
+            ephemeral_uvm,
+        )
+
     @torch.jit.ignore
     def _report_io_size_count(self, event: str, data: Tensor) -> Tensor:
         if self.stats_reporter is None:

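The detailed per-component breakdown is gated behind the FBGEMM_TBE_MEM_BREAKDOWN environment variable (off by default; only tbe.total_hbm_usage / tbe.total_uvm_usage are emitted otherwise) and is delivered through the existing TBEStatsReporter interface under the new tbe.hbm.* and tbe.uvm.* event names. A minimal sketch of enabling the flag and capturing those events with a stand-in reporter (the class below is illustrative, not an FBGEMM class; attaching a reporter to the TBE module follows the module's usual stats_reporter configuration and is not shown):

import os

# Enable the detailed breakdown before the TBE module starts reporting.
os.environ["FBGEMM_TBE_MEM_BREAKDOWN"] = "1"

class PrintingReporter:
    """Stand-in reporter showing the shape of the calls made by the new code."""

    def should_report(self, step: int) -> bool:
        return step % 100 == 0

    def report_data_amount(
        self,
        iteration_step: int,
        event_name: str,
        data_bytes: int,
        embedding_id: str = "",
        tbe_id: str = "",
    ) -> None:
        # Expected event names include tbe.hbm.embeddings, tbe.hbm.optimizer_states,
        # tbe.hbm.cache, tbe.hbm.total_static_sparse, tbe.hbm.ephemeral and the
        # corresponding tbe.uvm.* counterparts.
        print(f"[{iteration_step}] {event_name} ({tbe_id}/{embedding_id}): {data_bytes} bytes")
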
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py
CHANGED

@@ -76,6 +76,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         reverse_qparam: bool = False,  # True to load qparams at end of each row; False to load qparam at begnning of each row.
         feature_names_per_table: Optional[list[list[str]]] = None,
         indices_dtype: torch.dtype = torch.int32,  # Used for construction of the remap_indices tensors. Should match the dtype of the indices passed in the forward() call (INT32 or INT64).
+        embedding_cache_mode: bool = False,  # True for zero initialization, False for randomized initialization
     ) -> None:  # noqa C901  # tuple of (rows, dims,)
         super(KVEmbeddingInference, self).__init__(
             embedding_specs=embedding_specs,
@@ -114,9 +115,13 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
         num_shards = 32
         uniform_init_lower: float = -0.01
         uniform_init_upper: float = 0.01
+
         # pyre-fixme[4]: Attribute must be annotated.
         self.kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
-            num_shards,
+            num_shards,
+            uniform_init_lower,
+            uniform_init_upper,
+            embedding_cache_mode,  # in embedding_cache_mode, we disable random init
         )

         self.specs: list[tuple[int, int, int]] = [

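The new embedding_cache_mode flag is forwarded to DramKVEmbeddingInferenceWrapper together with the uniform-init bounds, so rows materialized by the DRAM KV cache can be zero-initialized instead of randomly initialized. A construction sketch, assuming the embedding_specs format of the IntNBit inference base class (name, rows, dim, weight type, location) and leaving every other constructor argument at its default; the spec values below are hypothetical:

from fbgemm_gpu.split_embedding_configs import SparseType
from fbgemm_gpu.split_table_batched_embeddings_ops_common import EmbeddingLocation
from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference

# Hypothetical single-table spec: (table name, rows, embedding dim, weight type, location).
embedding_specs = [("table_0", 1_000_000, 128, SparseType.INT8, EmbeddingLocation.HOST)]

tbe = KVEmbeddingInference(
    embedding_specs=embedding_specs,
    embedding_cache_mode=True,  # new flag: zero-init rows instead of random init
)
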
{fbgemm_gpu_genai_nightly-2025.10.19.dist-info → fbgemm_gpu_genai_nightly-2025.10.25.dist-info}/RECORD
CHANGED

@@ -17,7 +17,7 @@ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43m
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
 fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=76ME0692CC691xpjiOsY3Xxy-LD_XKs8w9vq1gcm9tM,16440
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=f0sXfvkE0Wx0Rd3qTT4XmCbBK0wYgWGzhPncZEv-p48,180420
 fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=e3O9ElaWBGvG7TdT3Ok_8cB06jhskXuyCQ0t40dzsEY,5449
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
@@ -32,22 +32,22 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.genai.json.py,sha256=
+fbgemm_gpu/docs/target.genai.json.py,sha256=zheBID2LxrSDF8HifsFuVZUqVl4YgiUCdj1Xr8ty-O8,79
 fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
 fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=PGtZj3tM9mq65PGD08gEiTlj5PsvGaqJ_VkCvveHIIk,243904
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
 fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=2RjIDSzUXtoFoC2ryp-C-j5H83mbSjPwvsvTrThfrqE,215658
-fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=
+fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=q1o0FfGcUAQjkxKlJjjqKVSaPd3HaBSs6L9qVHY7qKI,152924
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=634_cv9QwuB1pBLoHNdY0zL57T0ByunmBUnFM795WOQ,74888976
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=ntFgFs0foi6NQx8eqs5I3fCjzKSI0spXfEWiMhlcT00,897
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
-fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=
+fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=K9cPXGOF4E9VHzuVtJjDPoTC7JjhEqS1RmmWSehQrKU,9887
 fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=XpAK_eyqDSKeFC5J9KpnKtbZG07mrDh9d2j1LFKzr-8,404
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
@@ -94,7 +94,7 @@ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=MNddYzoRlu0mNhnsVVG57JN7pB
 fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=sptdqcNE9JlgyIJ17neZaMxagKG469_ynX0mVx_JKBY,6090
 fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
 fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
-fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=
+fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
 fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
-fbgemm_gpu_genai_nightly-2025.10.
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/METADATA,sha256=nAiko7_2Se0u8j18sS-uwInAjpc9TEsVEq0Jn_YNmi4,2656
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/WHEEL,sha256=k9CVMKlTmOLLXq_OyiiJFbPd6UKfogV4yIUezgPmplE,108
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/RECORD,,

{fbgemm_gpu_genai_nightly-2025.10.19.dist-info → fbgemm_gpu_genai_nightly-2025.10.25.dist-info}/WHEEL
File without changes

{fbgemm_gpu_genai_nightly-2025.10.19.dist-info → fbgemm_gpu_genai_nightly-2025.10.25.dist-info}/top_level.txt
File without changes