fbgemm-gpu-genai-nightly 2025.12.17-cp313-cp313-manylinux_2_28_x86_64.whl → 2026.1.4-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fbgemm_gpu/asmjit.so CHANGED
Binary file
fbgemm_gpu/config/feature_list.py CHANGED
@@ -63,6 +63,9 @@ class FeatureGateName(Enum):
     # Enable TBE input parameters extraction
     TBE_REPORT_INPUT_PARAMS = auto()
 
+    # Enable tuned max segment length per CTA for B200
+    TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200 = auto()
+
     def is_enabled(self) -> bool:
         return FeatureGate.is_enabled(self)
 
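Note: the new gate follows the same runtime check pattern as the existing entries. A minimal usage sketch, assuming fbgemm_gpu.config re-exports FeatureGateName (as in recent FBGEMM releases); the call site is illustrative only:

import fbgemm_gpu  # assumed available; loads the FBGEMM operators
from fbgemm_gpu.config import FeatureGateName

# Hypothetical call site: TBE would switch to the tuned per-CTA max segment
# length on B200 when this gate is enabled.
if FeatureGateName.TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200.is_enabled():
    pass  # tuned path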
fbgemm_gpu/docs/target.genai.json.py CHANGED
@@ -1,6 +1,6 @@
 
 {
-    "version": "2025.12.17",
+    "version": "2026.1.4",
     "target": "genai",
     "variant": "cuda"
 }
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py CHANGED
@@ -289,7 +289,7 @@ def cutlass_blackwell_fmha_decode_forward(
     window_left: int = -1,
     window_right: int = -1,
     bottom_right: bool = True,
-    split_k_size: int = 1024,
+    split_k_size: int = 0,
     use_heuristic: bool = True,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
@@ -318,14 +318,9 @@ def cutlass_blackwell_fmha_decode_forward(
             split size using the heuristic. Default is True.
 
     Returns:
-        Conditional return based on split-K mode:
-        - Non-split case (split_k_size <= 0 and use_heuristic=False):
-            out: Same shape as input q ([B, H, D] for varlen or [B, 1, H, D] for batch)
-                with bfloat16 dtype
-            lse: [B, H, 1] (always float32)
-        - Split case (split_k_size > 0 or use_heuristic=True):
-            out: [B, H, num_splits, D] with float32 dtype (partial outputs for later reduction)
-            lse: [B, num_splits, H] (always float32)
+        Kernel output with Q dimension added:
+        - out: [B, 1, H, num_splits, D] (num_splits=1 when split-K disabled)
+        - lse: [B, num_splits, H, 1]
     """
     _validate_decode_inputs(q, k, v, seqlen_kv)
 
@@ -365,15 +360,12 @@ def cutlass_blackwell_fmha_decode_forward(
         split_k_size=split_k_size,
     )
 
-    # Handle output based on split-K mode
-    is_split = split_k_size > 0
-
-    if not is_split:
-        # out shape: [B, H, Splits = 1, D] -> original shape
-        out = out.view(*original_shape)
-        # lse shape: [B, Splits = 1, H] -> [B, H, 1]
-        lse = lse.view(batch_size, -1, 1)
-
+    # Kernel returns: out [B, H, num_splits, D], lse [B, num_splits, H]
+    # Reshape to consistent format with Q dimension:
+    #   out: [B, H, num_splits, D] -> [B, 1, H, num_splits, D]
+    #   lse: [B, num_splits, H]    -> [B, num_splits, H, 1]
+    out = out.unsqueeze(1)  # [B, 1, H, num_splits, D]
+    lse = lse.unsqueeze(-1)  # [B, num_splits, H, 1]
     return out, lse
 
 
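Note: with this change the decode forward always returns split-K-shaped tensors (num_splits=1 when splitting is disabled), so callers no longer branch on the mode. Below is a minimal sketch of how a caller could combine the partials with a standard log-sum-exp merge; it illustrates the math only and is not necessarily the library's fused reduction:

import torch

def merge_splits(out: torch.Tensor, lse: torch.Tensor):
    # out: [B, 1, H, S, D] float32 partial outputs; lse: [B, S, H, 1] float32
    lse_bhs = lse.squeeze(-1).permute(0, 2, 1)          # [B, H, S]
    lse_max = lse_bhs.max(dim=-1, keepdim=True).values  # [B, H, 1]
    lse_merged = lse_max + torch.exp(lse_bhs - lse_max).sum(-1, keepdim=True).log()
    weights = torch.exp(lse_bhs - lse_merged)           # per-split softmax weights
    out_merged = (out * weights.unsqueeze(1).unsqueeze(-1)).sum(dim=3)  # [B, 1, H, D]
    return out_merged, lse_merged                       # merged output and LSE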
fbgemm_gpu/fbgemm.so CHANGED
Binary file
fbgemm_gpu/split_embedding_configs.py CHANGED
@@ -313,6 +313,40 @@ def sparse_type_to_int(sparse_type: "SparseType") -> int:
     }[sparse_type.value]
 
 
+def sparse_type_int_to_dtype(ty: int) -> torch.dtype:
+    """
+    TorchScript-compatible function to convert a SparseType enum (as an integer) to torch.dtype.
+
+    This is a standalone function equivalent to SparseType.from_int(dtype_int).as_dtype() that works
+    with TorchScript. TorchScript does not support @staticmethod on Enum classes,
+    so this function provides a workaround.
+    """
+    if ty == 0:  # fp32
+        return torch.float32
+    elif ty == 1:  # fp16
+        return torch.float16
+    elif ty == 2:  # int8
+        return torch.uint8
+    elif ty == 3:  # int4
+        return torch.quint4x2
+    elif ty == 4:  # int2
+        return torch.quint2x4
+    elif ty == 5:  # bf16
+        return torch.bfloat16
+    elif ty == 6:  # fp8
+        return torch.uint8
+    elif ty == 7:  # mx4
+        return torch.uint8
+    elif ty == 9:
+        return (
+            torch.float8_e4m3fnuz
+            if torch.version.hip is not None
+            else torch.float8_e4m3fn
+        )
+    else:  # invalid or non-enumerated value
+        raise ValueError(f"Unsupported sparse type: {ty}")
+
+
 @enum.unique
 class SparseType(enum.Enum):
     FP32 = "fp32"
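Note: a small round-trip sketch pairing the new helper with the existing sparse_type_to_int shown above; the BF16-to-5 mapping follows the comments in the function body:

import torch
from fbgemm_gpu.split_embedding_configs import (
    SparseType,
    sparse_type_int_to_dtype,
    sparse_type_to_int,
)

# enum -> int code -> torch.dtype (the TorchScript-friendly direction)
ty = sparse_type_to_int(SparseType.BF16)  # 5, per the mapping above
assert sparse_type_int_to_dtype(ty) == torch.bfloat16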
fbgemm_gpu/split_table_batched_embeddings_ops_training.py CHANGED
@@ -49,6 +49,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
     SplitState,
 )
 from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
+    check_allocated_vbe_output,
     generate_vbe_metadata,
     is_torchdynamo_compiling,
 )
@@ -60,6 +61,7 @@ from fbgemm_gpu.tbe_input_multiplexer import (
 )
 
 from fbgemm_gpu.utils.loader import load_torch_module, load_torch_module_bc
+from fbgemm_gpu.utils.writeback_util import writeback_gradient
 
 try:
     load_torch_module(
@@ -159,6 +161,7 @@ class UserEnabledConfigDefinition:
     # More details can be found in D64848802.
     use_rowwise_bias_correction: bool = False
     use_writeback_bwd_prehook: bool = False
+    writeback_first_feature_only: bool = False
 
 
 @dataclass(frozen=True)
@@ -1181,6 +1184,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self.use_writeback_bwd_prehook: bool = (
             extra_optimizer_config.use_writeback_bwd_prehook
         )
+
+        writeback_first_feature_only: bool = (
+            extra_optimizer_config.writeback_first_feature_only
+        )
         self.log(f"self.extra_optimizer_config is {extra_optimizer_config}")
         if self.use_rowwise_bias_correction and not self.optimizer == OptimType.ADAM:
             raise AssertionError(
@@ -1469,6 +1476,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         # self.log("TBE_V2 Knob is set to True; Using experimental TBE")
 
         self.is_experimental: bool = is_experimental
+        self._writeback_first_feature_only: bool = writeback_first_feature_only
 
         # Get a debug function pointer
         self._debug_print_input_stats: Callable[..., None] = (
@@ -1483,7 +1491,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if optimizer == OptimType.EXACT_SGD and self.use_writeback_bwd_prehook:
             # Register writeback hook for Exact_SGD optimizer
             self.log(
-                "SplitTableBatchedEmbeddingBagsCodegen: use_writeback_bwd_prehook is enabled."
+                f"SplitTableBatchedEmbeddingBagsCodegen: use_writeback_bwd_prehook is enabled with first feature only={self._writeback_first_feature_only}"
             )
             # pyre-fixme[6]: Expected `typing.Callable[[Module, Union[Tensor, typing.Tuple[Tensor, ...]]], Union[None, Tensor, typing.Tuple[Tensor, ...]]]`
             self.register_full_backward_pre_hook(self.writeback_hook)
@@ -2003,6 +2011,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self,
         offsets: Tensor,
         batch_size_per_feature_per_rank: Optional[list[list[int]]],
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> invokers.lookup_args.VBEMetadata:
         # Blocking D2H copy, but only runs at first call
         self.feature_dims = self.feature_dims.cpu()
@@ -2025,6 +2035,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self.pooling_mode,
             self.feature_dims,
             self.current_device,
+            vbe_output,
+            vbe_output_offsets,
         )
 
     @torch.jit.ignore
@@ -2033,40 +2045,17 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         # This allows models using this class to compile correctly
         return FeatureGate.is_enabled(feature)
 
-    def writeback_update_gradient(
-        self, indices: torch.Tensor, offsets: torch.Tensor, grad: Tensor
-    ) -> Tensor:
-        if indices.numel() == 0:
-            return grad[0]
-        num_of_tables = len(set(self.feature_table_map))
-        assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
-        batch_size = offsets.shape[0] // num_of_tables
-        max_indices = indices.max()
-        non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
-        # disable dedup across different table
-        indices = ((offsets[non_empty_index]) // batch_size) * (
-            1 + max_indices
-        ) + indices
-        grad = grad[0]
-        _, idx, counts = torch.unique(
-            indices, dim=0, sorted=True, return_inverse=True, return_counts=True
-        )
-        _, ind_sorted = torch.sort(idx, stable=True)
-        cum_sum = counts.cumsum(0)
-        cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
-        first_indicies = ind_sorted[cum_sum]
-        mask = torch.zeros_like(grad, device=grad.device)
-        original_index = non_empty_index[first_indicies]
-
-        mask[original_index] = grad[original_index]
-        return mask
-
     # pyre-fixme[2]: For 1st argument expected not ANY
     def writeback_hook(self, module: Any, grad: Tensor) -> tuple[Tensor]:
         indices = self._indices
         offsets = self._offsets
-
-        return (self.writeback_update_gradient(indices, offsets, grad),)
+        return writeback_gradient(
+            grad,
+            indices,
+            offsets,
+            self.feature_table_map,
+            self._writeback_first_feature_only,
+        )
 
     def forward(  # noqa: C901
         self,
@@ -2078,6 +2067,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         total_unique_indices: Optional[int] = None,
         hash_zch_identities: Optional[Tensor] = None,
         hash_zch_runtime_meta: Optional[Tensor] = None,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> Tensor:
         """
         The forward pass function that
@@ -2130,13 +2121,22 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 be set when using `OptimType.NONE`. This is because TBE
                 requires this information for allocating the weight gradient
                 tensor in the backward pass.
-
             hash_zch_identities (Optional[Tensor]): The original raw IDs before
                 remapping to ZCH (Zero-Collision Hash) table slots. This tensor is
                 populated when using Multi-Probe Zero Collision Hash (MPZCH) modules
                 and is required for Raw Embedding Streaming (RES) to maintain
                 consistency between training and inference.
-
+            vbe_output (Optional[Tensor]): An optional 2-D tensor that contains
+                the output for TBE VBE. The shape of the tensor is
+                [1, total_vbe_output_size] where total_vbe_output_size is the
+                output size across all ranks and all embedding tables.
+                If this tensor is not None, the TBE VBE forward output is written
+                to this tensor at the locations specified by `vbe_output_offsets`.
+            vbe_output_offsets (Optional[Tensor]): An optional 2-D tensor that
+                contains VBE output offsets into `vbe_output`. The shape of the
+                tensor is [num_ranks, num_features].
+                vbe_output_offsets[r][f] represents the starting offset for rank `r`
+                and feature `f`.
         Returns:
             A 2D-tensor containing looked up data. Shape `(B, total_D)` where `B` =
             batch size and `total_D` = the sum of all embedding dimensions in the
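Note: as a rough illustration of the buffer contract above, a caller might size the merged buffer and compute offsets like this. This is a sketch under the assumption that pooled output rows are laid out rank-major then feature-major with B * D elements per (rank, feature) block; the authoritative layout is defined by the TBE kernels, and the buffer is validated later by check_allocated_vbe_output:

import torch

def allocate_vbe_output(
    batch_size_per_feature_per_rank,  # [num_features][num_ranks] batch sizes
    feature_dims,                     # embedding dim per feature
    dtype,                            # must match the TBE output dtype
    device,
):
    num_features = len(batch_size_per_feature_per_rank)
    num_ranks = len(batch_size_per_feature_per_rank[0])
    offsets = torch.empty(num_ranks, num_features, dtype=torch.long)
    total = 0
    for r in range(num_ranks):
        for f in range(num_features):
            offsets[r][f] = total  # starting offset for rank r, feature f
            total += batch_size_per_feature_per_rank[f][r] * feature_dims[f]
    # the checks below require a 1-D, contiguous buffer of the TBE output dtype
    return torch.zeros(total, dtype=dtype, device=device), offsets.to(device)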
@@ -2210,8 +2210,16 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             batch_size_per_feature_per_rank,
             force_cast_input_types=True,
             prefetch_pipeline=False,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
 
+        # VBE (batch_size_per_feature_per_rank is not None) is not supported
+        # together with the writeback backward pre-hook
+        assert not (
+            batch_size_per_feature_per_rank is not None
+            and self.use_writeback_bwd_prehook
+        ), "VBE is not supported with writeback_bwd_prehook"
+
         # Print input stats if enable (for debugging purpose only)
         self._debug_print_input_stats(indices, offsets, per_sample_weights)
 
@@ -3875,6 +3883,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
         force_cast_input_types: bool = True,
         prefetch_pipeline: bool = False,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
         """
         Prepare TBE inputs as follows:
@@ -3901,9 +3911,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             metadata
         """
 
+        if vbe_output is not None or vbe_output_offsets is not None:
+            assert (
+                not self.use_cpu
+            ), "[TBE API v2] Using pre-allocated vbe_output is not supported on CPU"
+            check_allocated_vbe_output(
+                self.output_dtype,
+                batch_size_per_feature_per_rank,
+                vbe_output,
+                vbe_output_offsets,
+            )
+
         # Generate VBE metadata
         vbe_metadata = self._generate_vbe_metadata(
-            offsets, batch_size_per_feature_per_rank
+            offsets, batch_size_per_feature_per_rank, vbe_output, vbe_output_offsets
         )
 
         vbe = vbe_metadata.B_offsets is not None
@@ -3976,7 +3997,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 self.is_nobag,
                 vbe_metadata.max_B_feature_rank,
                 self.info_B_num_bits,
-                offsets.numel() - 1,  # total_B
+                offsets.numel() - 1,  # total_B
+                vbe_output_offsets,
             )
         else:
             b_t_map = None
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py CHANGED
@@ -7,7 +7,7 @@
 
 # pyre-unsafe
 
-from typing import Optional
+from typing import List, Optional
 
 import torch
 from torch import Tensor
@@ -31,6 +31,7 @@ except Exception:
 
 # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
 import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+from fbgemm_gpu.split_embedding_configs import sparse_type_int_to_dtype
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
 
 
@@ -40,6 +41,8 @@ def generate_vbe_metadata(
     pooling_mode: PoolingMode,
     feature_dims_cpu: Tensor,
     device: torch.device,
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
 ) -> invokers.lookup_args.VBEMetadata:
     """
     Generate VBE metadata based on batch_size_per_feature_per_rank.
@@ -133,6 +136,8 @@ def generate_vbe_metadata(
             max_B_feature_rank=max_B_feature_rank,
             # pyre-ignore
             output_size=output_size,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
     else:
         vbe_metadata = invokers.lookup_args.VBEMetadata(
@@ -142,5 +147,43 @@ def generate_vbe_metadata(
             max_B=-1,
             max_B_feature_rank=-1,
             output_size=-1,
+            vbe_output=None,
+            vbe_output_offsets=None,
         )
     return vbe_metadata
+
+
+def check_allocated_vbe_output(
+    output_dtype: int,
+    batch_size_per_feature_per_rank: Optional[List[List[int]]],
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
+) -> None:
+    assert (
+        batch_size_per_feature_per_rank is not None
+    ), "[Merged_VBE] vbe_output is passed, batch_size_per_feature_per_rank cannot be None"
+    assert (
+        vbe_output is not None
+    ), "[Merged_VBE] vbe_output_offsets is not None, vbe_output cannot be None"
+    assert (
+        vbe_output_offsets is not None
+    ), "[Merged_VBE] vbe_output is not None, vbe_output_offsets cannot be None"
+    num_features = len(batch_size_per_feature_per_rank)
+    num_ranks = len(batch_size_per_feature_per_rank[0])
+    assert vbe_output_offsets.shape == torch.Size(
+        [num_ranks, num_features]
+    ), f"[Merged_VBE] Mismatched vbe_output_offsets shape. batch_size_per_feature_per_rank={batch_size_per_feature_per_rank}. Expected: {torch.Size([num_ranks, num_features])}, Actual: {vbe_output_offsets.shape}"
+    assert (
+        vbe_output.dim() == 1
+    ), f"[Merged_VBE] vbe_output must have 1 dimension, but got {vbe_output.dim()}. vbe_output shape is {vbe_output.shape}"
+    assert (
+        vbe_output_offsets.device == vbe_output.device
+    ), "[Merged_VBE] vbe_output_offsets and vbe_output must be on the same device"
+    _output_dtype = sparse_type_int_to_dtype(output_dtype)
+    assert (
+        vbe_output.dtype == _output_dtype
+    ), f"[Merged_VBE] vbe_output dtype must match TBE output dtype {_output_dtype} (SparseType {output_dtype}), but got {vbe_output.dtype}"
+    assert (
+        vbe_output_offsets.is_contiguous()
+    ), "[Merged_VBE] vbe_output_offsets needs to be contiguous"
+    assert vbe_output.is_contiguous(), "[Merged_VBE] vbe_output needs to be contiguous"
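Note: threading a pre-allocated buffer through the forward call would then look roughly like this; `emb` and the buffer names are illustrative, with the buffer built as in the earlier allocation sketch:

# emb: a SplitTableBatchedEmbeddingBagsCodegen (or the SSD TBE module below)
output = emb(
    indices,
    offsets,
    batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
    vbe_output=vbe_out,              # 1-D, contiguous, TBE output dtype
    vbe_output_offsets=vbe_offsets,  # [num_ranks, num_features], same device
)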
fbgemm_gpu/tbe/ssd/training.py CHANGED
@@ -50,6 +50,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
     WeightDecayMode,
 )
 from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
+    check_allocated_vbe_output,
     generate_vbe_metadata,
     is_torchdynamo_compiling,
 )
@@ -2308,6 +2309,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
         self,
         offsets: Tensor,
         batch_size_per_feature_per_rank: Optional[list[list[int]]],
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> invokers.lookup_args.VBEMetadata:
         # Blocking D2H copy, but only runs at first call
         self.feature_dims = self.feature_dims.cpu()
@@ -2326,6 +2329,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
             self.pooling_mode,
             self.feature_dims,
             self.current_device,
+            vbe_output,
+            vbe_output_offsets,
         )
 
     def _increment_iteration(self) -> int:
@@ -2356,11 +2361,26 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
         per_sample_weights: Optional[Tensor] = None,
         feature_requires_grad: Optional[Tensor] = None,
         batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
         # pyre-fixme[7]: Expected `Tensor` but got implicit return value of `None`.
     ) -> Tensor:
         self.clear_cache()
+        if vbe_output is not None or vbe_output_offsets is not None:
+            # CPU is not supported in SSD TBE
+            check_allocated_vbe_output(
+                self.output_dtype,
+                batch_size_per_feature_per_rank,
+                vbe_output,
+                vbe_output_offsets,
+            )
         indices, offsets, per_sample_weights, vbe_metadata = self.prepare_inputs(
-            indices, offsets, per_sample_weights, batch_size_per_feature_per_rank
+            indices,
+            offsets,
+            per_sample_weights,
+            batch_size_per_feature_per_rank,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
 
         if len(self.timesteps_prefetched) == 0:
@@ -3691,13 +3711,15 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
         offsets: Tensor,
         per_sample_weights: Optional[Tensor] = None,
         batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
         """
         Prepare TBE inputs
         """
         # Generate VBE metadata
         vbe_metadata = self._generate_vbe_metadata(
-            offsets, batch_size_per_feature_per_rank
+            offsets, batch_size_per_feature_per_rank, vbe_output, vbe_output_offsets
         )
 
         # Force casting indices and offsets to long
fbgemm_gpu/utils/writeback_util.py ADDED
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+def writeback_update_gradient(
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    grad: torch.Tensor,
+    feature_table_map: list[int],
+) -> torch.Tensor:
+    """
+    Update the gradient tensor by deduplicating indices across all features/tables.
+    For duplicate indices, only the first occurrence receives the gradient, so the
+    gradient update behaves like an assignment.
+
+    NOTE: This function does not support VBE yet
+
+    Args:
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        grad (torch.Tensor): Gradient tensor to be updated
+        feature_table_map (list[int]): Mapping from feature to table
+
+    Returns:
+        torch.Tensor: Updated gradient tensor with duplicates masked out
+    """
+    if indices.numel() == 0:
+        return grad[0]
+    # get the number of features to estimate the batch size
+    num_of_tables = len(feature_table_map)
+    assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+    batch_size = offsets.shape[0] // num_of_tables
+    max_indices = indices.max()
+    non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
+    # disable dedup across different tables
+    indices = ((offsets[non_empty_index]) // batch_size) * (1 + max_indices) + indices
+    grad = grad[0]
+    _, idx, counts = torch.unique(
+        indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+    )
+    _, ind_sorted = torch.sort(idx, stable=True)
+    cum_sum = counts.cumsum(0)
+    cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
+    first_indicies = ind_sorted[cum_sum]
+    mask = torch.zeros_like(grad, device=grad.device)
+    original_index = non_empty_index[first_indicies]
+
+    mask[original_index] = grad[original_index]
+    return mask
+
+
+def writeback_update_gradient_first_feature_only(
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    grad: torch.Tensor,
+    feature_table_map: list[int],
+) -> torch.Tensor:
+    """
+    Special case of writeback_update_gradient where the gradient only needs to be
+    updated for the first feature. Other features will be forward-only.
+
+    NOTE: This function does not support VBE yet
+
+    Args:
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        grad (torch.Tensor): Gradient tensor to be updated
+        feature_table_map (list[int]): Mapping from feature to table
+
+    Returns:
+        torch.Tensor: Updated gradient tensor with duplicates masked out
+    """
+    num_of_tables = len(feature_table_map)
+    batch_size = (offsets.shape[0] - 1) // num_of_tables
+    shrink_indices = indices[: offsets[batch_size]]
+    if shrink_indices.numel() == 0 or indices.numel() == 0:
+        return grad[0]
+    assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+
+    grad = grad[0]
+    _, idx, counts = torch.unique(
+        shrink_indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+    )
+    _, ind_sorted = torch.sort(idx, stable=True)
+    cum_sum = counts.cumsum(0)
+    cum_sum = torch.cat((torch.tensor([0]).to(shrink_indices.device), cum_sum[:-1]))
+    first_indicies = ind_sorted[cum_sum]
+    mask = torch.zeros_like(grad, device=grad.device)
+
+    mask[first_indicies] = grad[first_indicies]
+    return mask
+
+
+def writeback_gradient(
+    grad: torch.Tensor,
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    feature_table_map: list[int],
+    writeback_first_feature_only: bool = False,
+) -> tuple[torch.Tensor]:
+    """
+    Compute the deduplicated gradient for the writeback operation.
+
+    Args:
+        grad (torch.Tensor): Gradient tensor to be updated
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        feature_table_map (list[int]): Mapping from feature to table
+        writeback_first_feature_only (bool): If True, only the first feature will
+            apply the gradient update; other features will be read-only
+
+    Returns:
+        tuple[torch.Tensor]: Tuple containing the updated gradient tensor
+    """
+    if writeback_first_feature_only:
+        return (
+            writeback_update_gradient_first_feature_only(
+                indices, offsets, grad, feature_table_map
+            ),
+        )
+    else:
+        return (writeback_update_gradient(indices, offsets, grad, feature_table_map),)
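Note: a small self-contained check of the dedup behavior, with made-up values. Two features share one table with batch size 2; index 3 appears twice in the first feature, so only its first occurrence keeps a gradient row:

import torch
from fbgemm_gpu.utils.writeback_util import writeback_gradient

indices = torch.tensor([3, 3, 5, 7])     # feature 0 bags: [3], [3]; feature 1 bags: [5], [7]
offsets = torch.tensor([0, 1, 2, 3, 4])  # 2 features x batch size 2, one index per bag
grad = (torch.ones(4, 2),)               # the backward pre-hook passes a tuple of grads
(masked,) = writeback_gradient(grad, indices, offsets, feature_table_map=[0, 0])
assert torch.equal(masked[1], torch.zeros(2))  # duplicate of index 3 is zeroed
assert torch.equal(masked[0], torch.ones(2))   # first occurrence keeps its gradient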
fbgemm_gpu_genai_nightly-2025.12.17.dist-info/METADATA → fbgemm_gpu_genai_nightly-2026.1.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_genai_nightly
-Version: 2025.12.17
+Version: 2026.1.4
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
fbgemm_gpu_genai_nightly-2025.12.17.dist-info/RECORD → fbgemm_gpu_genai_nightly-2026.1.4.dist-info/RECORD RENAMED
@@ -1,8 +1,8 @@
 fbgemm_gpu/__init__.py,sha256=bL2dL7uYeXb1GvdjIDUTcLXLRGNfmnI4MQoE3-Gg5m8,6361
-fbgemm_gpu/asmjit.so,sha256=231yAFvSUfy_B5xni9sAPQlsi5so9alFN3tXN7GFcMQ,484232
+fbgemm_gpu/asmjit.so,sha256=UxnhHlu9LgmoRXa8fZwSX56b5QKffBxfAOs0AZLxRfk,501728
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=_fCdNktofSTSuedF0cLL3AKDTeKca5tty8RnRzKFCdg,5803160
+fbgemm_gpu/fbgemm.so,sha256=U864UANx-CVyFYk5ADawCd0uWRfntHaVcyl6AVty_3Q,5642616
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
@@ -10,20 +10,20 @@ fbgemm_gpu/quantize_comm.py,sha256=ZfXtRHfqpVpV6k2PDL6oTUkKYzopqAV2M6vavp_RLSM,1
 fbgemm_gpu/quantize_utils.py,sha256=q8Aokk6nlHbXF6HcDBbhBCAGSZV4klM8uPF-MUFFtAw,8324
 fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
 fbgemm_gpu/sparse_ops.py,sha256=_EJC1pAbNnAnVQQ5JBg4DAV2TboIj-4XQkiKMmg1vXI,50417
-fbgemm_gpu/split_embedding_configs.py,sha256=fv29efZGD_cvh5KwdvTFD6GZtqJLYjWXW_0vMeyT_6k,15483
+fbgemm_gpu/split_embedding_configs.py,sha256=EuVFKIDrgRQpRC5mmB4Du6WftK5GXJvDue9_ezt_eBI,16575
 fbgemm_gpu/split_embedding_inference_converter.py,sha256=AghGW22MgMsdHzdwdPMPYDjgas5AE_estckY8rMgXVU,7056
 fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
 fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
 fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=eFxb_bDfBV8G76pmd-SxDXXXnqgbuGYOS4pSU8JS5dg,19295
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=D72laY5iFC3_6f_qHnPMizDDxwI0QW7-21RyY0ZikK4,187705
-fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=e3O9ElaWBGvG7TdT3Ok_8cB06jhskXuyCQ0t40dzsEY,5449
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=rNGMELM_xFIsdS_340PB7bsn9h_VjONq_JJG1SjHyvQ,188992
+fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
 fbgemm_gpu/uvm.py,sha256=guNK8ZzR80jmv-CyRgEhxhVYhjz3R9d6tB8Hu1uWDUo,1047
 fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
-fbgemm_gpu/config/feature_list.py,sha256=iDOGr9nwTqUhWsqOefRIqIo1jwLSeSII4jGnLeU01kg,2359
+fbgemm_gpu/config/feature_list.py,sha256=hhDNkkafd-Oetvuqv9ylBVTNM-lKPi029mpRqq-JZCA,2467
 fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
 fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
 fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
@@ -32,9 +32,9 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.genai.json.py,sha256=Zzc84wR-3UYjzYFUQk2gX2r6FEia8mMClTg1gA1HVoc,79
+fbgemm_gpu/docs/target.genai.json.py,sha256=5TMzQCJ6eRjDaUActAOucxjizI7IZg56rn512-ujiE4,77
 fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=Mt99lNGcaYTxWVGqPP8Q2l-n_7lj2DNmPHura1eHAMM,183392
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=y0Z22D1LnOkH9vXtRVPYWJ5raZC27OTViPEtnqi8TyY,190656
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
 fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=R4VNZdPSgmRmwDfTt2CShED2SGUF6dCXSUW2C4LISgE,215713
@@ -43,11 +43,11 @@ fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=5ClZ-GDrx6q0uaqW
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=rR2xW3Km17SqFFHLL-1WKIQ2hxd7-UpiEbEQmsvx8z8,64298336
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=2iHWrQDzhysRNMPbjFQpsxNdAkIRq__vTHy75sa4kJo,65238760
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=-R_LxyHpdXMILU9TNuYoRisBCkfK0_VLyixefaeZf4g,1463
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=gbhNU3mDTKJb3yt3inIDbiUjX_SG1oZfzgDygtHvMpk,10101
-fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=r1AhV2qdIqxtYYeze6yr6_wg_Xzfzc4QJEBeNsGY4Gw,17570
+fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=fD39_WH7TfNCiP5Vl46ToX6PsLMLUFLhizT26Qe7TWg,17282
 fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=XpAK_eyqDSKeFC5J9KpnKtbZG07mrDh9d2j1LFKzr-8,404
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
@@ -99,7 +99,7 @@ fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29Xt
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
 fbgemm_gpu/tbe/ssd/inference.py,sha256=B_uX66ajGA9YKGlFa5TmGWs7b-b1RFigzwxmENZ9Oio,22816
-fbgemm_gpu/tbe/ssd/training.py,sha256=ElFvQHF5wQBzrqU34F6ZR2IEBVzKO3j3symntP15S3E,211380
+fbgemm_gpu/tbe/ssd/training.py,sha256=C6M3H_f8oWWRkC4R-BJED73au-Gl9SUVllxOoFSiDkI,212234
 fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
 fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
 fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
@@ -119,9 +119,10 @@ fbgemm_gpu/utils/__init__.py,sha256=JQQNdcTTaEU6ptK-OW-ZQBwTFxEZZpWOtBXWwEZm39o,
 fbgemm_gpu/utils/filestore.py,sha256=oVtbKGaPQki1JgbJCkrkElukOFVyxntQpSC0lYBKgho,6455
 fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,990
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
+fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-2025.12.17.dist-info/METADATA,sha256=oJzBJPiPBYhvls7W-MDbX-yBH6y4CRyGDHNvDFaAyBU,2657
-fbgemm_gpu_genai_nightly-2025.12.17.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
-fbgemm_gpu_genai_nightly-2025.12.17.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
-fbgemm_gpu_genai_nightly-2025.12.17.dist-info/RECORD,,
+fbgemm_gpu_genai_nightly-2026.1.4.dist-info/METADATA,sha256=MjhefCkWlccqGa-waygmSKkW1vaKWbpxX1U8VLRrMJ0,2655
+fbgemm_gpu_genai_nightly-2026.1.4.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
+fbgemm_gpu_genai_nightly-2026.1.4.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2026.1.4.dist-info/RECORD,,