fbgemm-gpu-genai-nightly 2025.10.20-cp312-cp312-manylinux_2_28_x86_64.whl → 2025.10.25-cp312-cp312-manylinux_2_28_x86_64.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Potentially problematic release: this version of fbgemm-gpu-genai-nightly might be problematic.

fbgemm_gpu/__init__.py CHANGED
@@ -15,7 +15,6 @@ import torch
 # Based on the FBGEMM-PyTorch compatibility table at
 # https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
 _fbgemm_torch_compat_table = {
-    "1.4": "2.9",
     "1.3": "2.8",
     "1.2": "2.7",
     "1.1": "2.6",
fbgemm_gpu/docs/target.genai.json.py CHANGED
@@ -1,6 +1,6 @@
 
 {
-    "version": "2025.10.20",
+    "version": "2025.10.25",
     "target": "genai",
     "variant": "cuda"
 }
fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py CHANGED
@@ -3840,6 +3840,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
     (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
     (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
     (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
+    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
+    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
+    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
+    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
 ]
 
 
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py CHANGED
@@ -61,6 +61,8 @@ def _cutlass_blackwell_fmha_forward(
     softmax_scale: float | None = None,
     causal: bool = False,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_left: int = -1,
     window_right: int = -1,
     bottom_right: bool = True,
@@ -79,6 +81,8 @@ def _cutlass_blackwell_fmha_forward(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
         window_size_left=window_left,
         window_size_right=window_right,
         bottom_right=bottom_right,
@@ -171,6 +175,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         max_seq_len_q: Optional[int] = None,
         max_seq_len_k: Optional[int] = None,
         seqlen_kv: Optional[torch.Tensor] = None,
+        page_table: Optional[torch.Tensor] = None,
+        seqlen_k: Optional[int] = None,
         window_size: tuple[int, int] = (-1, -1),
         bottom_right: bool = True,
         deterministic: bool = False,
@@ -220,6 +226,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             softmax_scale,
             causal,
             seqlen_kv,
+            page_table,
+            seqlen_k,
             window_left,
             window_right,
             bottom_right,
@@ -252,6 +260,8 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
         None,
         None,
         None,
+        None,
+        None,
     ]:
         if ctx.is_gen:
             # For gen case, no backward pass is needed (generation is inference only)
@@ -279,7 +289,23 @@ class CutlassBlackwellFmhaFunc(torch.autograd.Function):
             bottom_right=ctx.bottom_right,
             deterministic=ctx.deterministic,
         )
-        return dq, dk, dv, None, None, None, None, None, None, None, None, None, None
+        return (
+            dq,
+            dk,
+            dv,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
 
 
 def cutlass_blackwell_fmha_func(
@@ -293,6 +319,8 @@ def cutlass_blackwell_fmha_func(
     max_seq_len_q: int | None = None,
     max_seq_len_k: int | None = None,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = None,
     window_size: tuple[int, int] | None = (-1, -1),
     bottom_right: bool = True,
     deterministic: bool = False,
@@ -308,6 +336,8 @@ def cutlass_blackwell_fmha_func(
         max_seq_len_q,
         max_seq_len_k,
         seqlen_kv,
+        page_table,
+        seqlen_k,
         window_size,
         bottom_right,
         deterministic,
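
This release threads two new optional arguments, page_table and seqlen_k, from cutlass_blackwell_fmha_func through CutlassBlackwellFmhaFunc.forward down to _cutlass_blackwell_fmha_forward, presumably to support a paged KV cache. A hedged call sketch follows; the q/k/v names, shapes, dtypes, and the page-table layout are illustrative assumptions and are not taken from this diff:

import torch
from fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_interface import (
    cutlass_blackwell_fmha_func,
)

# Assumed decode shapes: batch=2, 1 query token per sequence, 8 heads, head_dim=128.
q = torch.randn(2, 1, 8, 128, device="cuda", dtype=torch.bfloat16)
# Assumed paged KV cache layout: num_pages x page_size x heads x head_dim.
k = torch.randn(64, 64, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn_like(k)
# Assumed page table: one row of physical page indices per sequence.
page_table = torch.randint(0, 64, (2, 32), device="cuda", dtype=torch.int32)
seqlen_kv = torch.tensor([1500, 2048], device="cuda", dtype=torch.int32)

out = cutlass_blackwell_fmha_func(
    q,
    k,
    v,
    causal=True,
    seqlen_kv=seqlen_kv,
    page_table=page_table,  # new in this nightly
    seqlen_k=2048,          # new in this nightly: assumed max KV length covered by the page table
)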
fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py CHANGED
@@ -175,7 +175,7 @@ class QuantizeOpBase(metaclass=abc.ABCMeta):
         if use_cuda_graph:
             with torch.cuda.stream(torch.cuda.Stream()):
                 t = triton.testing.do_bench_cudagraph(
-                    lambda: self.quantize_and_compute(*args, **kwargs)
+                    lambda: self.quantize_and_compute(*args, **kwargs), rep=200
                 )
         else:
             t = triton.testing.do_bench(
@@ -188,7 +188,7 @@ class QuantizeOpBase(metaclass=abc.ABCMeta):
         if use_cuda_graph:
             with torch.cuda.stream(torch.cuda.Stream()):
                 t = triton.testing.do_bench_cudagraph(
-                    lambda: self.compute(*args, **kwargs)
+                    lambda: self.compute(*args, **kwargs), rep=200
                 )
         else:
             t = triton.testing.do_bench(lambda: self.compute(*args, **kwargs))
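
Both CUDA-graph benchmark paths now pass rep=200 to triton.testing.do_bench_cudagraph, which Triton interprets as the measurement time budget in milliseconds, lengthening the timing window beyond the default and reducing run-to-run noise. A standalone sketch of the same pattern with a stand-in workload:

import torch
import triton

a = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)
b = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)

def workload() -> None:
    # Stand-in for quantize_and_compute(...) / compute(...).
    torch.matmul(a, b)

# Benchmark under CUDA graph capture on a non-default stream, as quantize_ops.py does,
# with a 200 ms measurement budget.
with torch.cuda.stream(torch.cuda.Stream()):
    ms = triton.testing.do_bench_cudagraph(workload, rep=200)
print(f"mean latency: {ms:.4f} ms")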
fbgemm_gpu/split_table_batched_embeddings_ops_training.py CHANGED
@@ -1714,10 +1714,129 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )
 
-    @torch.jit.ignore
-    def _report_tbe_mem_usage(
+    def _get_tensor_memory(self, tensor_name: str) -> int:
+        """Get memory usage of a tensor in bytes."""
+        if not hasattr(self, tensor_name):
+            self.log(f"Tensor '{tensor_name}' not found, using 0 bytes")
+            return 0
+        tensor = getattr(self, tensor_name)
+        return tensor.numel() * tensor.element_size()
+
+    def _categorize_memory_by_location(
+        self, tensor_names: list[str]
+    ) -> tuple[int, int]:
+        """Categorize memory into HBM and UVM for given tensors.
+
+        Returns:
+            (hbm_bytes, uvm_bytes)
+        """
+        uvm_set = set(self._uvm_tensors_log)
+        hbm_bytes = 0
+        uvm_bytes = 0
+
+        for name in tensor_names:
+            size = self._get_tensor_memory(name)
+            if name in uvm_set:
+                uvm_bytes += size
+            else:
+                hbm_bytes += size
+
+        return hbm_bytes, uvm_bytes
+
+    def _report_hbm_breakdown(
+        self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
+    ) -> None:
+        """Report HBM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    def _report_uvm_breakdown(
         self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
     ) -> None:
+        """Report UVM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    @torch.jit.ignore
+    def _report_tbe_mem_usage(self) -> None:
         if self.stats_reporter is None:
             return
 
@@ -1725,22 +1844,24 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if not stats_reporter.should_report(self.step):
             return
 
+        # Calculate total memory from all parameters and buffers (always needed)
         total_mem_usage = sum(
-            param.numel() * param.element_size() for param in self.parameters()
-        ) + sum(buffer.numel() * buffer.element_size() for buffer in self.buffers())
+            p.numel() * p.element_size() for p in self.parameters()
+        ) + sum(b.numel() * b.element_size() for b in self.buffers())
+
+        # Calculate total HBM and UVM usage (always needed)
         if self.use_cpu:
             total_hbm_usage = 0
             total_uvm_usage = total_mem_usage
         else:
-            # hbm usage is total usage minus uvm usage
            total_uvm_usage = sum(
-                getattr(self, tensor_name).numel()
-                * getattr(self, tensor_name).element_size()
-                for tensor_name in self._uvm_tensors_log
-                if hasattr(self, tensor_name)
+                self._get_tensor_memory(name)
+                for name in self._uvm_tensors_log
+                if hasattr(self, name)
            )
            total_hbm_usage = total_mem_usage - total_uvm_usage
 
+        # Report total memory usage metrics (always reported for backward compatibility)
         stats_reporter.report_data_amount(
             iteration_step=self.step,
             event_name="tbe.total_hbm_usage",
@@ -1756,6 +1877,76 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )
 
+        # Check if detailed memory breakdown is enabled via environment variable
+        # Set FBGEMM_TBE_MEM_BREAKDOWN=1 to enable expensive detailed breakdown
+        enable_detailed_breakdown = (
+            int(os.environ.get("FBGEMM_TBE_MEM_BREAKDOWN", "0")) == 1
+        )
+
+        if not enable_detailed_breakdown:
+            return
+
+        # Tensor groups for sparse memory categorization
+        weight_tensors = ["weights_dev", "weights_host", "weights_uvm"]
+        optimizer_tensors = [
+            "momentum1_dev",
+            "momentum1_host",
+            "momentum1_uvm",
+            "momentum2_dev",
+            "momentum2_host",
+            "momentum2_uvm",
+        ]
+        cache_tensors = [
+            "lxu_cache_weights",
+            "lxu_cache_state",
+            "lxu_state",
+            "cache_hash_size_cumsum",
+            "cache_index_table_map",
+            "cache_miss_counter",
+            "lxu_cache_locking_counter",
+        ]
+
+        # Calculate total memory for each component
+        weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
+        optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
+        cache_total = sum(self._get_tensor_memory(t) for t in cache_tensors)
+
+        # Categorize memory by location (HBM vs UVM)
+        if self.use_cpu:
+            weights_hbm, weights_uvm = 0, weights_total
+            opt_hbm, opt_uvm = 0, optimizer_total
+            cache_hbm, cache_uvm = 0, cache_total
+        else:
+            weights_hbm, weights_uvm = self._categorize_memory_by_location(
+                weight_tensors
+            )
+            opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
+            cache_hbm, cache_uvm = self._categorize_memory_by_location(cache_tensors)
+
+        # Calculate ephemeral memory split between HBM and UVM
+        static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
+        static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
+        ephemeral_hbm = total_hbm_usage - static_sparse_hbm
+        ephemeral_uvm = total_uvm_usage - static_sparse_uvm
+
+        # Report granular memory breakdowns
+        self._report_hbm_breakdown(
+            stats_reporter,
+            weights_hbm,
+            opt_hbm,
+            cache_hbm,
+            static_sparse_hbm,
+            ephemeral_hbm,
+        )
+        self._report_uvm_breakdown(
+            stats_reporter,
+            weights_uvm,
+            opt_uvm,
+            cache_uvm,
+            static_sparse_uvm,
+            ephemeral_uvm,
+        )
+
     @torch.jit.ignore
     def _report_io_size_count(self, event: str, data: Tensor) -> Tensor:
         if self.stats_reporter is None:
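
The detailed per-component breakdown (the tbe.hbm.* and tbe.uvm.* events) is gated behind the FBGEMM_TBE_MEM_BREAKDOWN environment variable and is skipped by default; only the existing total-usage metrics are reported. A minimal opt-in sketch:

import os

# Opt in to the detailed HBM/UVM breakdown before the TBE module reports stats.
os.environ["FBGEMM_TBE_MEM_BREAKDOWN"] = "1"

# ... construct SplitTableBatchedEmbeddingBagsCodegen with a stats_reporter as usual.
# When stats_reporter.should_report(step) is true, the extra tbe.hbm.* / tbe.uvm.*
# events (embeddings, optimizer_states, cache, total_static_sparse, ephemeral)
# are then reported alongside the totals.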
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py CHANGED
@@ -76,6 +76,7 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
        reverse_qparam: bool = False, # True to load qparams at end of each row; False to load qparam at begnning of each row.
        feature_names_per_table: Optional[list[list[str]]] = None,
        indices_dtype: torch.dtype = torch.int32, # Used for construction of the remap_indices tensors. Should match the dtype of the indices passed in the forward() call (INT32 or INT64).
+        embedding_cache_mode: bool = False, # True for zero initialization, False for randomized initialization
    ) -> None: # noqa C901 # tuple of (rows, dims,)
        super(KVEmbeddingInference, self).__init__(
            embedding_specs=embedding_specs,
@@ -114,9 +115,13 @@ class KVEmbeddingInference(IntNBitTableBatchedEmbeddingBagsCodegen):
        num_shards = 32
        uniform_init_lower: float = -0.01
        uniform_init_upper: float = 0.01
+
        # pyre-fixme[4]: Attribute must be annotated.
        self.kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
-            num_shards, uniform_init_lower, uniform_init_upper
+            num_shards,
+            uniform_init_lower,
+            uniform_init_upper,
+            embedding_cache_mode, # in embedding_cache_mode, we disable random init
        )
 
        self.specs: list[tuple[int, int, int]] = [
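
KVEmbeddingInference now exposes embedding_cache_mode and forwards it to DramKVEmbeddingInferenceWrapper so that weights are zero-initialized instead of randomized. A hedged construction sketch; the embedding_specs layout below follows the IntNBit inference TBE convention (table name, rows, dim, weight type, location) and is an assumption here, as is having the fbgemm_gpu native ops loaded:

from fbgemm_gpu.split_embedding_configs import SparseType
from fbgemm_gpu.split_table_batched_embeddings_ops_common import EmbeddingLocation
from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference

# Assumed spec layout: (table name, num embeddings, embedding dim, weight dtype, location).
tbe = KVEmbeddingInference(
    embedding_specs=[("t0", 100_000, 128, SparseType.INT8, EmbeddingLocation.HOST)],
    embedding_cache_mode=True,  # new flag: zero-initialize instead of randomized init
)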
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_genai_nightly
-Version: 2025.10.20
+Version: 2025.10.25
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
@@ -1,4 +1,4 @@
-fbgemm_gpu/__init__.py,sha256=BxqlqUln-q_ljJpo_Cg3R2RYZqxCbZ0UjvdBe6DzNZk,6301
+fbgemm_gpu/__init__.py,sha256=A3DuseilQ-sEtBpeZsG0LOqN5Cl3e5DHI_YgCZEMhnE,6283
 fbgemm_gpu/asmjit.so,sha256=tp-5cN7HUYo7cjvR_kl_vfPBSEv78-IQxdvHN-nXFAM,501728
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
@@ -17,7 +17,7 @@ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43m
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
 fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=76ME0692CC691xpjiOsY3Xxy-LD_XKs8w9vq1gcm9tM,16440
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=lF9eP6GDTyqbEJgl-SO6gNYUk2dv2YE2bMEtzGkY21c,173757
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=f0sXfvkE0Wx0Rd3qTT4XmCbBK0wYgWGzhPncZEv-p48,180420
 fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=e3O9ElaWBGvG7TdT3Ok_8cB06jhskXuyCQ0t40dzsEY,5449
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
@@ -32,27 +32,27 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.genai.json.py,sha256=EffeoYnTPp4BLew_sFOpBinWQgXup1DReXuDroDTnh8,79
+fbgemm_gpu/docs/target.genai.json.py,sha256=zheBID2LxrSDF8HifsFuVZUqVl4YgiUCdj1Xr8ty-O8,79
 fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=fOyUuW3hkDvgT6wxaUvCzZtj5G6pWOfQKLnjIJ5FUAg,407744
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=25F2p_vZw0F_CgRVKMEAsbgOiaF28jJ-hc9fX5jibEY,243904
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
 fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=2RjIDSzUXtoFoC2ryp-C-j5H83mbSjPwvsvTrThfrqE,215658
-fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=5m4SdgUsf2rM_Vul8czgRn_5oVnyi-52TmeidXh05hg,152754
+fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=q1o0FfGcUAQjkxKlJjjqKVSaPd3HaBSs6L9qVHY7qKI,152924
 fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=rbjxTMefjQWgJrWK_bYFtBklJigFwv4awPeVexkkiIA,44511
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=xsfHP5BNQ6IqiCxVYYEvWfF2wTD3vSt9lYciiqm_5Nk,287360856
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=Hv72XrRgHXgVSTpj-tTHE-AvGkywHYYeoUXdYwY0_lQ,74888976
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=ntFgFs0foi6NQx8eqs5I3fCjzKSI0spXfEWiMhlcT00,897
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=FADVTYzS2u8fA-3iChS5CbtWd0mWF8F3lnXcwr_7vDw,7821
-fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=7ydkrZ6qqyiah1dlJX6EuEXXw6WwOqCj7D48PWNJcUw,9259
+fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=K9cPXGOF4E9VHzuVtJjDPoTC7JjhEqS1RmmWSehQrKU,9887
 fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=XpAK_eyqDSKeFC5J9KpnKtbZG07mrDh9d2j1LFKzr-8,404
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
 fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py,sha256=BWl6t-4acbuRSEX2aVNDlFrSWZkqMWK2sI3VONaMd3Q,24047
-fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=H6AchejyZs76_snM_ae5vV0cPr_Q0h35OQ8qED0r1N4,104915
+fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=cDZS2rCb1W2IEQYxsnGjauhlUhg2PFZ-9LqJ_SEdbiQ,104933
 fbgemm_gpu/experimental/gen_ai/moe/README.md,sha256=z9ybHmv4KFJ1drj5OByuFaOY0tRQwwiIW3Q22TB_2-k,904
 fbgemm_gpu/experimental/gen_ai/moe/__init__.py,sha256=lwSvff07yEav024B1XyfgW8r8hwNe--aEDywcO7rnbM,1905
 fbgemm_gpu/experimental/gen_ai/moe/activation.py,sha256=NiXhWyCNagI3P9N3N89iSX7xKuShdkq9DxEUAzoV6y0,7892
@@ -94,7 +94,7 @@ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=MNddYzoRlu0mNhnsVVG57JN7pB
 fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=sptdqcNE9JlgyIJ17neZaMxagKG469_ynX0mVx_JKBY,6090
 fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
 fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
-fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=m8rCF8bc_5vBrg9677TDZTQXqRdFt6YPUVVKv85up5s,14380
+fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
 fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
@@ -121,7 +121,7 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/METADATA,sha256=YSlW54hUiRgcqmN2NJaqU8mF-KLXXKCF_MpwTI2USC0,2656
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_genai_nightly-2025.10.20.dist-info/RECORD,,
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/METADATA,sha256=nAiko7_2Se0u8j18sS-uwInAjpc9TEsVEq0Jn_YNmi4,2656
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/WHEEL,sha256=vUT1hK8fT5m5CAs5kDyQ_ABrvCmtd0TCp5-4vN9tR5A,108
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2025.10.25.dist-info/RECORD,,