fbgemm-gpu-genai-nightly 2025.12.17__cp313-cp313-manylinux_2_28_x86_64.whl → 2026.1.9__cp313-cp313-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/config/feature_list.py +3 -0
- fbgemm_gpu/docs/target.genai.json.py +1 -1
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +10 -18
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/quantize_comm.py +8 -10
- fbgemm_gpu/quantize_utils.py +58 -6
- fbgemm_gpu/split_embedding_configs.py +34 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +57 -35
- fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
- fbgemm_gpu/tbe/ssd/training.py +24 -2
- fbgemm_gpu/triton/quantize.py +13 -7
- fbgemm_gpu/utils/writeback_util.py +124 -0
- {fbgemm_gpu_genai_nightly-2025.12.17.dist-info → fbgemm_gpu_genai_nightly-2026.1.9.dist-info}/METADATA +1 -1
- {fbgemm_gpu_genai_nightly-2025.12.17.dist-info → fbgemm_gpu_genai_nightly-2026.1.9.dist-info}/RECORD +19 -18
- {fbgemm_gpu_genai_nightly-2025.12.17.dist-info → fbgemm_gpu_genai_nightly-2026.1.9.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.12.17.dist-info → fbgemm_gpu_genai_nightly-2026.1.9.dist-info}/top_level.txt +0 -0
fbgemm_gpu/asmjit.so
CHANGED
Binary file
fbgemm_gpu/config/feature_list.py
CHANGED
@@ -63,6 +63,9 @@ class FeatureGateName(Enum):
     # Enable TBE input parameters extraction
     TBE_REPORT_INPUT_PARAMS = auto()

+    # Enable tuned max segment length per CTA for B200
+    TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200 = auto()
+
     def is_enabled(self) -> bool:
         return FeatureGate.is_enabled(self)

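For context, a minimal sketch of how a caller could query the new gate. FeatureGateName and is_enabled() come from the diff above; the segment-length values are made-up placeholders, not tuned numbers from this release.

# Illustrative sketch only.
from fbgemm_gpu.config.feature_list import FeatureGateName

if FeatureGateName.TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200.is_enabled():
    max_segment_length_per_cta = 1024  # placeholder "tuned" value for B200
else:
    max_segment_length_per_cta = 4096  # placeholder default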
fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so
CHANGED
Binary file
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py
CHANGED
@@ -289,7 +289,7 @@ def cutlass_blackwell_fmha_decode_forward(
     window_left: int = -1,
     window_right: int = -1,
     bottom_right: bool = True,
-    split_k_size: int =
+    split_k_size: int = 0,
     use_heuristic: bool = True,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
@@ -318,14 +318,9 @@
             split size using the heuristic. Default is True.

     Returns:
-
-        -
-
-            with bfloat16 dtype
-        lse: [B, H, 1] (always float32)
-        - Split case (split_k_size > 0 or use_heuristic=True):
-        out: [B, H, num_splits, D] with float32 dtype (partial outputs for later reduction)
-        lse: [B, num_splits, H] (always float32)
+        Kernel output with Q dimension added:
+        - out: [B, 1, H, num_splits, D] (num_splits=1 when split-K disabled)
+        - lse: [B, num_splits, H, 1]
     """
     _validate_decode_inputs(q, k, v, seqlen_kv)

@@ -365,15 +360,12 @@ def cutlass_blackwell_fmha_decode_forward(
         split_k_size=split_k_size,
     )

-    #
-
-
-
-
-
-    # lse shape: [B, Splits = 1, H] -> [B, H, 1]
-    lse = lse.view(batch_size, -1, 1)
-
+    # Kernel returns: out [B, H, num_splits, D], lse [B, num_splits, H]
+    # Reshape to consistent format with Q dimension:
+    # out: [B, H, num_splits, D] -> [B, 1, H, num_splits, D]
+    # lse: [B, num_splits, H] -> [B, num_splits, H, 1]
+    out = out.unsqueeze(1)  # [B, 1, H, num_splits, D]
+    lse = lse.unsqueeze(-1)  # [B, num_splits, H, 1]
     return out, lse

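For context, the decode path now always returns split-K style shapes: out [B, 1, H, num_splits, D] and lse [B, num_splits, H, 1]. A standard log-sum-exp combine over the split dimension might look like the sketch below; whether the FBGEMM op reduces the splits internally or expects the caller to do it is not shown in this diff, so treat this purely as an illustration of what the shapes mean.

import torch

def combine_split_k(out: torch.Tensor, lse: torch.Tensor) -> torch.Tensor:
    # out: [B, 1, H, num_splits, D] partial outputs; lse: [B, num_splits, H, 1]
    lse = lse.squeeze(-1).permute(0, 2, 1)                  # -> [B, H, num_splits]
    total_lse = torch.logsumexp(lse, dim=-1, keepdim=True)  # -> [B, H, 1]
    weights = torch.exp(lse - total_lse)                    # per-split weights
    combined = (out.squeeze(1) * weights.unsqueeze(-1)).sum(dim=2)  # -> [B, H, D]
    return combined.unsqueeze(1)                            # restore Q dim -> [B, 1, H, D]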
fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so
CHANGED
Binary file

fbgemm_gpu/fbgemm.so
CHANGED
Binary file
fbgemm_gpu/quantize_comm.py
CHANGED
@@ -25,7 +25,7 @@ from fbgemm_gpu.quantize_utils import (
     fp32_to_hfp8_with_clamp,
     fp32_to_mx4,
     hfp8_to_fp32,
-
+    mx4_to_float,
     RoundingMode,
 )

@@ -123,7 +123,7 @@ def _dequantize_tensor(
     comm_precision: SparseType,
     ctx: Optional[QuantizationContext] = None,
     is_fwd: bool = True,
-
+    output_dtype: Optional[SparseType] = None,
 ) -> torch.Tensor:
     if comm_precision == SparseType.FP32:
         assert quantized_tensor.dtype == torch.float
@@ -138,10 +138,8 @@ def _dequantize_tensor(
         if ctx is not None and ctx.row_dim > 0:
             row_dim_quant = ctx.row_dim_quant
             quantized_tensor_2d = quantized_tensor.view((-1, row_dim_quant))
-            # use provided
-            output_dtype_int = (
-                fp8_output_dtype.as_int() if fp8_output_dtype is not None else 0
-            )
+            # use provided output_dtype or default to FP32 (0)
+            output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
             dequant_tensor = torch.ops.fbgemm.FP8RowwiseQuantizedToFloat(
                 quantized_tensor_2d,
                 is_fwd,
@@ -161,7 +159,7 @@ def _dequantize_tensor(
         return dequant_tensor.view(-1)
     elif comm_precision == SparseType.MX4:
         mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT
-        return
+        return mx4_to_float(quantized_tensor, mx_group_size, output_dtype=output_dtype)
     else:
         raise ValueError(f"comm_precision={comm_precision} is not supported")

@@ -175,7 +173,7 @@ class QuantizedCommCodec:
         row_dim: Optional[int] = None,
         is_fwd: bool = True,
         rounding_mode: Optional[RoundingMode] = None,
-
+        output_dtype: Optional[SparseType] = None,
     ) -> None:
         if loss_scale is not None:
             if comm_precision not in [SparseType.FP16, SparseType.BF16]:
@@ -193,7 +191,7 @@ class QuantizedCommCodec:
         self._is_fwd = is_fwd
         self._row_dim: int = -1 if row_dim is None else row_dim
         self._rounding_mode: Optional[RoundingMode] = rounding_mode
-        self.
+        self._output_dtype: Optional[SparseType] = output_dtype
         if self._comm_precision == SparseType.MX4:
             self._row_dim = MX_GROUP_SIZE_DEFAULT if row_dim is None else row_dim
             self._rounding_mode = (
@@ -229,7 +227,7 @@ class QuantizedCommCodec:
             self._comm_precision,
            ctx,
             self._is_fwd,
-
+            output_dtype=self._output_dtype,
         )
         return dequantized_tensor

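For context, a hedged usage sketch of the new output_dtype plumbing: constructing the codec with output_dtype=SparseType.BF16 should make MX4 payloads dequantize straight to BF16 instead of FP32. The encode/decode call pattern, the remaining constructor arguments, and the CUDA tensor below are assumptions for illustration, not taken from this diff.

import torch
from fbgemm_gpu.quantize_comm import QuantizedCommCodec
from fbgemm_gpu.split_embedding_configs import SparseType

codec = QuantizedCommCodec(
    comm_precision=SparseType.MX4,   # quantize comm payloads to MX4
    loss_scale=None,
    output_dtype=SparseType.BF16,    # new: dequantize directly to BF16
)
payload = torch.randn(1024, device="cuda")
restored = codec.decode(codec.encode(payload))  # assumed codec API
assert restored.dtype == torch.bfloat16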
fbgemm_gpu/quantize_utils.py
CHANGED
@@ -14,9 +14,15 @@ import torch  # isort:skip

 import fbgemm_gpu

-from fbgemm_gpu.
+from fbgemm_gpu.split_embedding_configs import SparseType
+from fbgemm_gpu.triton.common import RoundingMode
 from fbgemm_gpu.triton.quantize_ref import py_dequantize_mx4, py_quantize_mx4

+if torch.cuda.is_available():
+    from fbgemm_gpu.triton import quantize_mx4
+    from fbgemm_gpu.triton.quantize import triton_dequantize_mx4
+
+
 try:
     # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
     open_source = bool(getattr(fbgemm_gpu, "open_source", False))
@@ -126,25 +132,71 @@ def mx4_to_fp32(
 ) -> torch.Tensor:
     """Dequantize an MX4 tensor to FP32 with triton or native cuda impl.

+    This function is kept for backward compatibility and always returns FP32.
+    For BF16 output, use mx4_to_float() with output_dtype=SparseType.BF16.
+    """
+    return mx4_to_float(
+        tensor,
+        group_size,
+        use_triton,
+        ebits,
+        mbits,
+        output_dtype=None,  # None = FP32 default for backward compatibility
+    )
+
+
+def mx4_to_float(
+    tensor: torch.Tensor,
+    group_size: int = 32,
+    use_triton: bool = True,
+    ebits: int = 2,
+    mbits: int = 1,
+    output_dtype: Optional[SparseType] = None,
+) -> torch.Tensor:
+    """Dequantize an MX4 tensor to FP32 or BF16 with triton or native cuda impl.
+
     Args:
         tensor (torch.Tensor): MX4 packed tensor with total elements (M / 2 + M / groupsize)
         group_size (int): Compute scale in chunks of group_size.
         use_triton (bool): If set, use triton quantization, otherwise cuda.
         ebits (int): Number of exponent bits in target mx4 format.
         mbits (int): Number of mantissa bits in target mx4 format.
+        output_dtype (Optional[SparseType]): Output dtype (FP32 or BF16).
+            Defaults to None (FP32) for backward compatibility.

     Return:
-        output:
+        output: Tensor with dtype matching output_dtype and total elements (M).
     """
+    # Validate output_dtype
+    supported_dtypes = {SparseType.FP32, SparseType.BF16}
+    if output_dtype is not None and output_dtype not in supported_dtypes:
+        raise ValueError(
+            f"output_dtype must be one of {supported_dtypes}, got {output_dtype}. "
+            f"FP16 is not supported due to potential overflow/underflow with MX4's wide exponent range. "
+            f"Use BF16 for memory savings with same dynamic range as FP32."
+        )
+
+    target_dtype = (
+        output_dtype.as_dtype() if output_dtype is not None else torch.float32
+    )
+
     # Accelerated MX4 dequantize is only available on cuda, if input is on cpu, use python.
     if not tensor.is_cuda and not tensor.is_mtia:
-
+        result = py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+        return result.to(target_dtype) if output_dtype is not None else result
     if use_triton:
         if tensor.is_mtia:
-            return mtia_dequantize_mx4(
-
+            return mtia_dequantize_mx4(
+                tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+            )
+        return triton_dequantize_mx4(
+            tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+        )
     else:
-
+        output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
+        return torch.ops.fbgemm.dequantize_mx_cuda(
+            tensor.flatten(), group_size, output_dtype_int
+        )


 def fp32_to_fp16_with_clamp(tensor: torch.Tensor) -> torch.Tensor:
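For context, a small round trip through the new API, assuming a CUDA device; fp32_to_mx4 is the packing counterpart imported alongside mx4_to_float in quantize_comm above, and the sizes here are illustrative.

import torch
from fbgemm_gpu.quantize_utils import fp32_to_mx4, mx4_to_float
from fbgemm_gpu.split_embedding_configs import SparseType

x = torch.randn(128, device="cuda")                    # multiple of group_size
packed = fp32_to_mx4(x, group_size=32)                 # MX4-packed uint8 payload
y_fp32 = mx4_to_float(packed, group_size=32)           # default output: float32
y_bf16 = mx4_to_float(packed, group_size=32, output_dtype=SparseType.BF16)
assert y_fp32.dtype == torch.float32 and y_bf16.dtype == torch.bfloat16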
fbgemm_gpu/split_embedding_configs.py
CHANGED
@@ -313,6 +313,40 @@ def sparse_type_to_int(sparse_type: "SparseType") -> int:
     }[sparse_type.value]


+def sparse_type_int_to_dtype(ty: int) -> torch.dtype:
+    """
+    TorchScript-compatible function to convert a SparseType enum value (as an integer) to torch.dtype.
+
+    This is a standalone function equivalent to SparseType.from_int(dtype_int).as_dtype() that works
+    with TorchScript. TorchScript does not support @staticmethod on Enum classes,
+    so this function provides a workaround.
+    """
+    if ty == 0:  # fp32
+        return torch.float32
+    elif ty == 1:  # fp16
+        return torch.float16
+    elif ty == 2:  # int8
+        return torch.uint8
+    elif ty == 3:  # int4
+        return torch.quint4x2
+    elif ty == 4:  # int2
+        return torch.quint2x4
+    elif ty == 5:  # bf16
+        return torch.bfloat16
+    elif ty == 6:  # fp8
+        return torch.uint8
+    elif ty == 7:  # mx4
+        return torch.uint8
+    elif ty == 9:
+        return (
+            torch.float8_e4m3fnuz
+            if torch.version.hip is not None
+            else torch.float8_e4m3fn
+        )
+    else:  # Invalid is 7 or non enumerated.
+        raise ValueError(f"Unsupported sparse type: {ty}")
+
+
 @enum.unique
 class SparseType(enum.Enum):
     FP32 = "fp32"
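For context, the intent stated in the docstring above can be sanity-checked with the enum's existing helpers (SparseType.as_int() and as_dtype() already exist in split_embedding_configs):

import torch
from fbgemm_gpu.split_embedding_configs import SparseType, sparse_type_int_to_dtype

assert sparse_type_int_to_dtype(SparseType.BF16.as_int()) == torch.bfloat16
assert sparse_type_int_to_dtype(SparseType.FP16.as_int()) == torch.float16
assert sparse_type_int_to_dtype(SparseType.INT8.as_int()) == torch.uint8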
fbgemm_gpu/split_table_batched_embeddings_ops_training.py
CHANGED
@@ -49,6 +49,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
     SplitState,
 )
 from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
+    check_allocated_vbe_output,
     generate_vbe_metadata,
     is_torchdynamo_compiling,
 )
@@ -60,6 +61,7 @@ from fbgemm_gpu.tbe_input_multiplexer import (
 )

 from fbgemm_gpu.utils.loader import load_torch_module, load_torch_module_bc
+from fbgemm_gpu.utils.writeback_util import writeback_gradient

 try:
     load_torch_module(
@@ -159,6 +161,7 @@ class UserEnabledConfigDefinition:
     # More details can be found in D64848802.
     use_rowwise_bias_correction: bool = False
     use_writeback_bwd_prehook: bool = False
+    writeback_first_feature_only: bool = False


 @dataclass(frozen=True)
@@ -1181,6 +1184,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self.use_writeback_bwd_prehook: bool = (
             extra_optimizer_config.use_writeback_bwd_prehook
         )
+
+        writeback_first_feature_only: bool = (
+            extra_optimizer_config.writeback_first_feature_only
+        )
         self.log(f"self.extra_optimizer_config is {extra_optimizer_config}")
         if self.use_rowwise_bias_correction and not self.optimizer == OptimType.ADAM:
             raise AssertionError(
@@ -1469,6 +1476,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         # self.log("TBE_V2 Knob is set to True; Using experimental TBE")

         self.is_experimental: bool = is_experimental
+        self._writeback_first_feature_only: bool = writeback_first_feature_only

         # Get a debug function pointer
         self._debug_print_input_stats: Callable[..., None] = (
@@ -1483,7 +1491,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if optimizer == OptimType.EXACT_SGD and self.use_writeback_bwd_prehook:
             # Register writeback hook for Exact_SGD optimizer
             self.log(
-                "SplitTableBatchedEmbeddingBagsCodegen: use_writeback_bwd_prehook is enabled."
+                f"SplitTableBatchedEmbeddingBagsCodegen: use_writeback_bwd_prehook is enabled with first feature only={self._writeback_first_feature_only}"
             )
             # pyre-fixme[6]: Expected `typing.Callable[[Module, Union[Tensor, typing.Tuple[Tensor, ...]]], Union[None, Tensor, typing.Tuple[Tensor, ...]]]`
             self.register_full_backward_pre_hook(self.writeback_hook)
@@ -2003,6 +2011,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self,
         offsets: Tensor,
         batch_size_per_feature_per_rank: Optional[list[list[int]]],
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> invokers.lookup_args.VBEMetadata:
         # Blocking D2H copy, but only runs at first call
         self.feature_dims = self.feature_dims.cpu()
@@ -2025,6 +2035,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self.pooling_mode,
             self.feature_dims,
             self.current_device,
+            vbe_output,
+            vbe_output_offsets,
         )

     @torch.jit.ignore
@@ -2033,40 +2045,17 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         # This allows models using this class to compile correctly
         return FeatureGate.is_enabled(feature)

-    def writeback_update_gradient(
-        self, indices: torch.Tensor, offsets: torch.Tensor, grad: Tensor
-    ) -> Tensor:
-        if indices.numel() == 0:
-            return grad[0]
-        num_of_tables = len(set(self.feature_table_map))
-        assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
-        batch_size = offsets.shape[0] // num_of_tables
-        max_indices = indices.max()
-        non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
-        # disable dedup across different table
-        indices = ((offsets[non_empty_index]) // batch_size) * (
-            1 + max_indices
-        ) + indices
-        grad = grad[0]
-        _, idx, counts = torch.unique(
-            indices, dim=0, sorted=True, return_inverse=True, return_counts=True
-        )
-        _, ind_sorted = torch.sort(idx, stable=True)
-        cum_sum = counts.cumsum(0)
-        cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
-        first_indicies = ind_sorted[cum_sum]
-        mask = torch.zeros_like(grad, device=grad.device)
-        original_index = non_empty_index[first_indicies]
-
-        mask[original_index] = grad[original_index]
-        return mask
-
     # pyre-fixme[2]: For 1st argument expected not ANY
     def writeback_hook(self, module: Any, grad: Tensor) -> tuple[Tensor]:
         indices = self._indices
         offsets = self._offsets
-
-
+        return writeback_gradient(
+            grad,
+            indices,
+            offsets,
+            self.feature_table_map,
+            self._writeback_first_feature_only,
+        )

     def forward(  # noqa: C901
         self,
@@ -2078,6 +2067,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         total_unique_indices: Optional[int] = None,
         hash_zch_identities: Optional[Tensor] = None,
         hash_zch_runtime_meta: Optional[Tensor] = None,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> Tensor:
         """
         The forward pass function that
@@ -2130,13 +2121,22 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 be set when using `OptimType.NONE`. This is because TBE
                 requires this information for allocating the weight gradient
                 tensor in the backward pass.
-
             hash_zch_identities (Optional[Tensor]): The original raw IDs before
                 remapping to ZCH (Zero-Collision Hash) table slots. This tensor is
                 populated when using Multi-Probe Zero Collision Hash (MPZCH) modules
                 and is required for Raw Embedding Streaming (RES) to maintain
                 consistency between training and inference.
-
+            vbe_output (Optional[Tensor]): An optional 2-D tensor of size that
+                contains output for TBE VBE. The shape of the tensor is
+                [1, total_vbe_output_size] where total_vbe_output_size is the
+                output size across all ranks and all embedding tables.
+                If this tensor is not None, the TBE VBE forward output is written
+                to this tensor at the locations specified by `vbe_output_offsets`.
+            vbe_output_offsets (Optional[Tensor]): An optional 2-D tensor that
+                contains VBE output offsets to `vbe_output`. The shape of the
+                tensor is [num_ranks, num_features].
+                vbe_output_offsets[r][f] represents the starting offset for rank `r`
+                and feature `f`.
         Returns:
             A 2D-tensor containing looked up data. Shape `(B, total_D)` where `B` =
             batch size and `total_D` = the sum of all embedding dimensions in the
@@ -2210,8 +2210,16 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             batch_size_per_feature_per_rank,
             force_cast_input_types=True,
             prefetch_pipeline=False,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )

+        # Only enable VBE if batch_size_per_feature_per_rank is not None
+        assert not (
+            batch_size_per_feature_per_rank is not None
+            and self.use_writeback_bwd_prehook
+        ), "VBE is not supported with writeback_bwd_prehook"
+
         # Print input stats if enable (for debugging purpose only)
         self._debug_print_input_stats(indices, offsets, per_sample_weights)

@@ -3875,6 +3883,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
         force_cast_input_types: bool = True,
         prefetch_pipeline: bool = False,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
         """
         Prepare TBE inputs as follows:
@@ -3901,9 +3911,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             metadata
         """

+        if vbe_output is not None or vbe_output_offsets is not None:
+            assert (
+                not self.use_cpu
+            ), "[TBE API v2] Using pre-allocated vbe_output is not supported on CPU"
+            check_allocated_vbe_output(
+                self.output_dtype,
+                batch_size_per_feature_per_rank,
+                vbe_output,
+                vbe_output_offsets,
+            )
+
         # Generate VBE metadata
         vbe_metadata = self._generate_vbe_metadata(
-            offsets, batch_size_per_feature_per_rank
+            offsets, batch_size_per_feature_per_rank, vbe_output, vbe_output_offsets
         )

         vbe = vbe_metadata.B_offsets is not None
@@ -3976,7 +3997,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 self.is_nobag,
                 vbe_metadata.max_B_feature_rank,
                 self.info_B_num_bits,
-                offsets.numel() - 1,  # total_B
+                offsets.numel() - 1,  # total_B,
+                vbe_output_offsets,
             )
         else:
             b_t_map = None
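For context, a hedged sketch of calling forward() with a pre-allocated merged VBE output. The module `tbe`, the `indices`/`offsets` tensors and all sizes below are placeholders; the flat 1-D buffer and the [num_ranks, num_features] offset grid follow the check_allocated_vbe_output validation in the next file's diff, and the buffer dtype must match the module's output_dtype (FP32 assumed here).

import torch

# Placeholders: `tbe` is an existing SplitTableBatchedEmbeddingBagsCodegen on GPU,
# `indices`/`offsets` are its usual inputs; the numbers are illustrative only.
batch_size_per_feature_per_rank = [[2, 3], [2, 3]]   # 2 features x 2 ranks
total_vbe_output_size = 1024                          # placeholder: sum of B * D over ranks/features
vbe_output = torch.empty(total_vbe_output_size, dtype=torch.float32, device="cuda")
vbe_output_offsets = torch.tensor([[0, 128], [512, 640]], device="cuda")  # [num_ranks, num_features]

pooled = tbe(
    indices,
    offsets,
    batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
    vbe_output=vbe_output,
    vbe_output_offsets=vbe_output_offsets,
)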
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py
CHANGED
@@ -7,7 +7,7 @@

 # pyre-unsafe

-from typing import Optional
+from typing import List, Optional

 import torch
 from torch import Tensor
@@ -31,6 +31,7 @@ except Exception:

 # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
 import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
+from fbgemm_gpu.split_embedding_configs import sparse_type_int_to_dtype
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode


@@ -40,6 +41,8 @@ def generate_vbe_metadata(
     pooling_mode: PoolingMode,
     feature_dims_cpu: Tensor,
     device: torch.device,
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
 ) -> invokers.lookup_args.VBEMetadata:
     """
     Generate VBE metadata based on batch_size_per_feature_per_rank.
@@ -133,6 +136,8 @@ def generate_vbe_metadata(
             max_B_feature_rank=max_B_feature_rank,
             # pyre-ignore
             output_size=output_size,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
     else:
         vbe_metadata = invokers.lookup_args.VBEMetadata(
@@ -142,5 +147,43 @@ def generate_vbe_metadata(
             max_B=-1,
             max_B_feature_rank=-1,
             output_size=-1,
+            vbe_output=None,
+            vbe_output_offsets=None,
         )
     return vbe_metadata
+
+
+def check_allocated_vbe_output(
+    output_dtype: int,
+    batch_size_per_feature_per_rank: Optional[List[List[int]]],
+    vbe_output: Optional[Tensor] = None,
+    vbe_output_offsets: Optional[Tensor] = None,
+) -> None:
+    assert (
+        batch_size_per_feature_per_rank is not None
+    ), "[Merged_VBE] vbe_output is passed, batch_size_per_feature_per_rank cannot be None"
+    assert (
+        vbe_output is not None
+    ), "[Merged_VBE] vbe_output_offsets is not None, vbe_output cannot be None"
+    assert (
+        vbe_output_offsets is not None
+    ), "[Merged_VBE] vbe_output is not None, vbe_output_offsets cannot be None"
+    num_features = len(batch_size_per_feature_per_rank)
+    num_ranks = len(batch_size_per_feature_per_rank[0])
+    assert vbe_output_offsets.shape == torch.Size(
+        [num_ranks, num_features]
+    ), f"[Merged_VBE] Mismatched vbe_output_offsets shape. batch_size_per_feature_per_rank={batch_size_per_feature_per_rank}. Expected: {torch.Size([num_ranks, num_features])}, Actual: {vbe_output_offsets.shape}"
+    assert (
+        vbe_output.dim() == 1
+    ), f"[Merged_VBE] vbe_output must have 1 dimension, but got {vbe_output.dim()}. vbe_output shape is {vbe_output.shape}"
+    assert (
+        vbe_output_offsets.device == vbe_output.device
+    ), "[Merged_VBE] vbe_output_offsets and vbe_output must be on the same device"
+    _output_dtype = sparse_type_int_to_dtype(output_dtype)
+    assert (
+        vbe_output.dtype == _output_dtype
+    ), f"[Merged_VBE] vbe_output dtype must match TBE output dtype {_output_dtype} (SparseType {output_dtype}), but got {vbe_output.dtype}"
+    assert (
+        vbe_output_offsets.is_contiguous()
+    ), "[Merged_VBE] vbe_output_offsets needs to be contiguous"
+    assert vbe_output.is_contiguous(), "[Merged_VBE] vbe_output needs to be contiguous"
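For context, one way the [num_ranks, num_features] offset grid could be produced on the caller's side; the rank-major, element-unit layout here is an assumption made for illustration, not something this diff prescribes, and build_vbe_output_offsets is a hypothetical helper name.

import torch

def build_vbe_output_offsets(
    batch_size_per_feature_per_rank: list[list[int]],  # [num_features][num_ranks]
    embedding_dims: list[int],                         # output dim D per feature
) -> tuple[torch.Tensor, int]:
    num_features = len(batch_size_per_feature_per_rank)
    num_ranks = len(batch_size_per_feature_per_rank[0])
    offsets = torch.zeros(num_ranks, num_features, dtype=torch.int64)
    running = 0
    for r in range(num_ranks):
        for f in range(num_features):
            offsets[r][f] = running
            running += batch_size_per_feature_per_rank[f][r] * embedding_dims[f]
    # `running` is the total element count the flat vbe_output buffer must hold
    return offsets, running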
fbgemm_gpu/tbe/ssd/training.py
CHANGED
@@ -50,6 +50,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
     WeightDecayMode,
 )
 from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
+    check_allocated_vbe_output,
     generate_vbe_metadata,
     is_torchdynamo_compiling,
 )
@@ -2308,6 +2309,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
         self,
         offsets: Tensor,
         batch_size_per_feature_per_rank: Optional[list[list[int]]],
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> invokers.lookup_args.VBEMetadata:
         # Blocking D2H copy, but only runs at first call
         self.feature_dims = self.feature_dims.cpu()
@@ -2326,6 +2329,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
             self.pooling_mode,
             self.feature_dims,
             self.current_device,
+            vbe_output,
+            vbe_output_offsets,
         )

     def _increment_iteration(self) -> int:
@@ -2356,11 +2361,26 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
         per_sample_weights: Optional[Tensor] = None,
         feature_requires_grad: Optional[Tensor] = None,
         batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
         # pyre-fixme[7]: Expected `Tensor` but got implicit return value of `None`.
     ) -> Tensor:
         self.clear_cache()
+        if vbe_output is not None or vbe_output_offsets is not None:
+            # CPU is not supported in SSD TBE
+            check_allocated_vbe_output(
+                self.output_dtype,
+                batch_size_per_feature_per_rank,
+                vbe_output,
+                vbe_output_offsets,
+            )
         indices, offsets, per_sample_weights, vbe_metadata = self.prepare_inputs(
-            indices,
+            indices,
+            offsets,
+            per_sample_weights,
+            batch_size_per_feature_per_rank,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )

         if len(self.timesteps_prefetched) == 0:
@@ -3691,13 +3711,15 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
         offsets: Tensor,
         per_sample_weights: Optional[Tensor] = None,
         batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
         """
         Prepare TBE inputs
         """
         # Generate VBE metadata
         vbe_metadata = self._generate_vbe_metadata(
-            offsets, batch_size_per_feature_per_rank
+            offsets, batch_size_per_feature_per_rank, vbe_output, vbe_output_offsets
         )

         # Force casting indices and offsets to long
fbgemm_gpu/triton/quantize.py
CHANGED
@@ -575,7 +575,7 @@ def _kernel_dequantize_mx4(
     # Write final outputs.
     tl.store(
         out + output_offset,
-        scaled_fp32,
+        scaled_fp32.to(out.dtype.element_ty),
         # Mask values that are out of this chunk or the main array.
         mask=(output_offset < OUTPUT_SIZE)
         & (output_offset < OUTPUT_CHUNK_SIZE * (pid + 1)),
@@ -588,10 +588,14 @@ def _kernel_dequantize_mx4(


 def triton_dequantize_mx4(
-    a: torch.Tensor,
+    a: torch.Tensor,
+    group_size: int = 32,
+    ebits: int = 2,
+    mbits: int = 1,
+    output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """
-    Dequantize a tensor from mx4 format to fp32.
+    Dequantize a tensor from mx4 format to fp32 or bf16.

     Args:
         a (Tensor): [M / 2 + M / group_size] MX4 tensor packed into int8 values
@@ -599,13 +603,15 @@ def triton_dequantize_mx4(
         group_size (int): Size of chunks that use the same shared exponent.
         ebits (int): Number of bits to use for exponent in target mx4 format.
         mbits (int): Number of bits to use for mantissa in target mx4 format.
+        output_dtype (torch.dtype): Output dtype (FP32 or BF16).
+            Defaults to torch.float32 for backward compatibility.

     Returns:
-        torch.Tensor: [M, K] dequantized
+        torch.Tensor: [M, K] dequantized tensor in the specified dtype.
     """
     # If given an empty shape, return an empty tensor.
     if a.numel() == 0:
-        return torch.empty(a.shape, device=a.device, dtype=
+        return torch.empty(a.shape, device=a.device, dtype=output_dtype)
     # View a as 2D for simplicity.
     orig_shape = a.shape
     a = a.flatten()
@@ -622,9 +628,9 @@ def triton_dequantize_mx4(
     # Use a lookup table to convert
     mx4_to_fp_values = get_mx4_lookup_table(ebits, mbits, a.device)

-    # Create output tensor.
+    # Create output tensor in target dtype.
     output_elems = num_groups * group_size
-    out = torch.empty([output_elems], device=a.device, dtype=
+    out = torch.empty([output_elems], device=a.device, dtype=output_dtype)
     # Check if we need to use int64 for indexing.
     use_int64 = num_threads * groups_per_thread * group_size > 2**31 - 1
     # Invoke triton dequantization kernel over rows.
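For context, a hedged usage sketch of the extended signature, assuming a CUDA device and that quantize_mx4 (imported in quantize_utils above) packs with the same group_size:

import torch
from fbgemm_gpu.triton import quantize_mx4
from fbgemm_gpu.triton.quantize import triton_dequantize_mx4

x = torch.randn(64, device="cuda")
packed = quantize_mx4(x, group_size=32)
y = triton_dequantize_mx4(packed, group_size=32, output_dtype=torch.bfloat16)
assert y.dtype == torch.bfloat16 and y.numel() == x.numel()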
fbgemm_gpu/utils/writeback_util.py
ADDED
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+def writeback_update_gradient(
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    grad: torch.Tensor,
+    feature_table_map: list[int],
+) -> torch.Tensor:
+    """
+    Update gradient tensor by deduplicating indices across all features/tables.
+    For duplicate indices, only the first occurrence receives the gradient to achieve the assign purpose via gradient update
+
+    NOTE: This function is not supporting VBE yet
+
+    Args:
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        grad (torch.Tensor): Gradient tensor to be updated
+        feature_table_map (list[int]): Mapping from feature to table
+
+    Returns:
+        torch.Tensor: Updated gradient tensor with duplicates masked out
+    """
+    if indices.numel() == 0:
+        return grad[0]
+    # get num of feature to estimate batch size
+    num_of_tables = len(feature_table_map)
+    assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+    batch_size = offsets.shape[0] // num_of_tables
+    max_indices = indices.max()
+    non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
+    # disable dedup across different table
+    indices = ((offsets[non_empty_index]) // batch_size) * (1 + max_indices) + indices
+    grad = grad[0]
+    _, idx, counts = torch.unique(
+        indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+    )
+    _, ind_sorted = torch.sort(idx, stable=True)
+    cum_sum = counts.cumsum(0)
+    cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
+    first_indicies = ind_sorted[cum_sum]
+    mask = torch.zeros_like(grad, device=grad.device)
+    original_index = non_empty_index[first_indicies]
+
+    mask[original_index] = grad[original_index]
+    return mask
+
+
+def writeback_update_gradient_first_feature_only(
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    grad: torch.Tensor,
+    feature_table_map: list[int],
+) -> torch.Tensor:
+    """
+    Special case of writeback_update_gradient where gradient only needs to be updated for the first feature. Other features will be forward-only
+
+    NOTE: This function is not supporting VBE yet
+
+    Args:
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        grad (torch.Tensor): Gradient tensor to be updated
+        feature_table_map (list[int]): Mapping from feature to table
+
+    Returns:
+        torch.Tensor: Updated gradient tensor with duplicates masked out
+    """
+    num_of_tables = len(feature_table_map)
+    batch_size = (offsets.shape[0] - 1) // num_of_tables
+    shrink_indices = indices[: offsets[batch_size]]
+    if shrink_indices.numel() == 0 or indices.numel() == 0:
+        return grad[0]
+    assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+
+    grad = grad[0]
+    _, idx, counts = torch.unique(
+        shrink_indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+    )
+    _, ind_sorted = torch.sort(idx, stable=True)
+    cum_sum = counts.cumsum(0)
+    cum_sum = torch.cat((torch.tensor([0]).to(shrink_indices.device), cum_sum[:-1]))
+    first_indicies = ind_sorted[cum_sum]
+    mask = torch.zeros_like(grad, device=grad.device)
+
+    mask[first_indicies] = grad[first_indicies]
+    return mask
+
+
+def writeback_gradient(
+    grad: torch.Tensor,
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    feature_table_map: list[int],
+    writeback_first_feature_only: bool = False,
+) -> tuple[torch.Tensor]:
+    """
+    Compute deduplicated gradient for writeback operation.
+
+    Args:
+        grad (torch.Tensor): Gradient tensor to be updated
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        feature_table_map (list[int]): Mapping from feature to table
+        writeback_first_feature_only (bool): If True, only first feature will apply gradient update, other features will be read-only
+
+    Returns:
+        tuple[torch.Tensor]: Tuple containing the updated gradient tensor
+    """
+    if writeback_first_feature_only:
+        return (
+            writeback_update_gradient_first_feature_only(
+                indices, offsets, grad, feature_table_map
+            ),
+        )
+    else:
+        return (writeback_update_gradient(indices, offsets, grad, feature_table_map),)
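For context, the dedup step both helpers rely on, isolated into a tiny self-contained example: torch.unique's inverse mapping plus a stable sort recovers the first occurrence of each duplicated index, and only those positions keep their gradient.

import torch

indices = torch.tensor([7, 3, 7, 5, 3])
_, inverse, counts = torch.unique(
    indices, sorted=True, return_inverse=True, return_counts=True
)
_, ind_sorted = torch.sort(inverse, stable=True)
cum_sum = torch.cat((torch.tensor([0]), counts.cumsum(0)[:-1]))
first_occurrence = ind_sorted[cum_sum]
# first_occurrence == tensor([1, 3, 0]): the first 3 (pos 1), first 5 (pos 3), first 7 (pos 0)

grad = torch.ones(5, 4)
mask = torch.zeros_like(grad)
mask[first_occurrence] = grad[first_occurrence]  # duplicate occurrences contribute nothing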
{fbgemm_gpu_genai_nightly-2025.12.17.dist-info → fbgemm_gpu_genai_nightly-2026.1.9.dist-info}/RECORD
RENAMED
@@ -1,29 +1,29 @@
 fbgemm_gpu/__init__.py,sha256=bL2dL7uYeXb1GvdjIDUTcLXLRGNfmnI4MQoE3-Gg5m8,6361
-fbgemm_gpu/asmjit.so,sha256=
+fbgemm_gpu/asmjit.so,sha256=UxnhHlu9LgmoRXa8fZwSX56b5QKffBxfAOs0AZLxRfk,501728
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
 fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
-fbgemm_gpu/fbgemm.so,sha256=
+fbgemm_gpu/fbgemm.so,sha256=HQUXhk9ikCtui6125NmRjOJvAMaWRgZXLlBcQHsh2Xo,5646712
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
-fbgemm_gpu/quantize_comm.py,sha256=
-fbgemm_gpu/quantize_utils.py,sha256=
+fbgemm_gpu/quantize_comm.py,sha256=j4-wBqWRtXjhtQBKi7IOAftNDzv8-AeX9YXlD8e682c,11983
+fbgemm_gpu/quantize_utils.py,sha256=fK4Dk9Qpjsu4qASCwAxkLjbiFRLI71Hd-AtHY4NyMZ8,10200
 fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
 fbgemm_gpu/sparse_ops.py,sha256=_EJC1pAbNnAnVQQ5JBg4DAV2TboIj-4XQkiKMmg1vXI,50417
-fbgemm_gpu/split_embedding_configs.py,sha256=
+fbgemm_gpu/split_embedding_configs.py,sha256=EuVFKIDrgRQpRC5mmB4Du6WftK5GXJvDue9_ezt_eBI,16575
 fbgemm_gpu/split_embedding_inference_converter.py,sha256=AghGW22MgMsdHzdwdPMPYDjgas5AE_estckY8rMgXVU,7056
 fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
 fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
 fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=eFxb_bDfBV8G76pmd-SxDXXXnqgbuGYOS4pSU8JS5dg,19295
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=
-fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=rNGMELM_xFIsdS_340PB7bsn9h_VjONq_JJG1SjHyvQ,188992
+fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
 fbgemm_gpu/uvm.py,sha256=guNK8ZzR80jmv-CyRgEhxhVYhjz3R9d6tB8Hu1uWDUo,1047
 fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
-fbgemm_gpu/config/feature_list.py,sha256=
+fbgemm_gpu/config/feature_list.py,sha256=hhDNkkafd-Oetvuqv9ylBVTNM-lKPi029mpRqq-JZCA,2467
 fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
 fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
 fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
@@ -32,9 +32,9 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
-fbgemm_gpu/docs/target.genai.json.py,sha256=
+fbgemm_gpu/docs/target.genai.json.py,sha256=TVO8vYaBQPaEdT-bYeXlTdOGiTw4ceWeAWa9m9Wnerg,77
 fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
-fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=
+fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=9NH_0L5RRD5NwIOILKiOGjKo25isXYYkmwFvbIlUGe0,190656
 fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
 fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
 fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=R4VNZdPSgmRmwDfTt2CShED2SGUF6dCXSUW2C4LISgE,215713
@@ -43,11 +43,11 @@ fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=5ClZ-GDrx6q0uaqW
 fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
 fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
 fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
-fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=
+fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=_1HEs4E69AIPZ7zauPqcHB01egv5eNf2IKNZuihMOAA,65230568
 fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=-R_LxyHpdXMILU9TNuYoRisBCkfK0_VLyixefaeZf4g,1463
 fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=gbhNU3mDTKJb3yt3inIDbiUjX_SG1oZfzgDygtHvMpk,10101
-fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=
+fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=fD39_WH7TfNCiP5Vl46ToX6PsLMLUFLhizT26Qe7TWg,17282
 fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=XpAK_eyqDSKeFC5J9KpnKtbZG07mrDh9d2j1LFKzr-8,404
 fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
 fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
@@ -99,7 +99,7 @@ fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29Xt
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
 fbgemm_gpu/tbe/ssd/inference.py,sha256=B_uX66ajGA9YKGlFa5TmGWs7b-b1RFigzwxmENZ9Oio,22816
-fbgemm_gpu/tbe/ssd/training.py,sha256=
+fbgemm_gpu/tbe/ssd/training.py,sha256=C6M3H_f8oWWRkC4R-BJED73au-Gl9SUVllxOoFSiDkI,212234
 fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
 fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
 fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
@@ -111,7 +111,7 @@ fbgemm_gpu/tbe/utils/quantize.py,sha256=icN2MXnl5rNqtKhGKkjpelx5pYBMYUv-6CrghxeV
 fbgemm_gpu/tbe/utils/requests.py,sha256=rQkEoaUUWEYCQM-1K_Lxg1wPcyIVw8sbdaGFTpsaE5I,18040
 fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
 fbgemm_gpu/triton/common.py,sha256=wnkLd2a8fKpefymLL-LjNKEL4hDVSxFiF5g3aF8mzsw,2131
-fbgemm_gpu/triton/quantize.py,sha256=
+fbgemm_gpu/triton/quantize.py,sha256=bjMPgcUOcuG_d9I_EjSCpkU3Fr5f3FU7CIWIqzc_N3w,27074
 fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2MWS80,11553
 fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
 fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=F2eQWjkWMR5RWQ48oIr-8OU_CRZyLazDpT7DFrDWS6g,29871
@@ -119,9 +119,10 @@ fbgemm_gpu/utils/__init__.py,sha256=JQQNdcTTaEU6ptK-OW-ZQBwTFxEZZpWOtBXWwEZm39o,
 fbgemm_gpu/utils/filestore.py,sha256=oVtbKGaPQki1JgbJCkrkElukOFVyxntQpSC0lYBKgho,6455
 fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,990
 fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
+fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
 list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
 list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
-fbgemm_gpu_genai_nightly-
-fbgemm_gpu_genai_nightly-
-fbgemm_gpu_genai_nightly-
-fbgemm_gpu_genai_nightly-
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/METADATA,sha256=WQ7sQGvWWGQao3Wcjk39i_bFY2BvrmWfGHxHgWxEsug,2655
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_genai_nightly-2026.1.9.dist-info/RECORD,,
{fbgemm_gpu_genai_nightly-2025.12.17.dist-info → fbgemm_gpu_genai_nightly-2026.1.9.dist-info}/WHEEL
RENAMED
File without changes

{fbgemm_gpu_genai_nightly-2025.12.17.dist-info → fbgemm_gpu_genai_nightly-2026.1.9.dist-info}/top_level.txt
RENAMED
File without changes