fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/tbe/utils/requests.py CHANGED
@@ -8,7 +8,7 @@
 
  import logging
  from dataclasses import dataclass
- from typing import List, Optional, Tuple
+ from typing import Optional
 
  import numpy as np
  import numpy.typing as npt
@@ -32,20 +32,20 @@ class TBERequest:
  indices: torch.Tensor
  offsets: torch.Tensor
  per_sample_weights: Optional[torch.Tensor] = None
- Bs_per_feature_per_rank: Optional[List[List[int]]] = None
+ Bs_per_feature_per_rank: Optional[list[list[int]]] = None
 
- def unpack_2(self) -> Tuple[torch.Tensor, torch.Tensor]:
+ def unpack_2(self) -> tuple[torch.Tensor, torch.Tensor]:
  return (self.indices, self.offsets)
 
  def unpack_3(
  self,
- ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
  return (self.indices, self.offsets, self.per_sample_weights)
 
  def unpack_4(
  self,
- ) -> Tuple[
- torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[List[List[int]]]
+ ) -> tuple[
+ torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[list[list[int]]]
  ]:
  return (
  self.indices,
@@ -56,21 +56,36 @@ class TBERequest:
 
 
  def generate_requests_from_data_file(
- requests_data_file: str,
  iters: int,
  B: int,
  T: int,
  L: int,
  E: int,
  weighted: bool,
+ requests_data_file: Optional[str] = None,
+ indices_file: Optional[str] = None,
+ offsets_file: Optional[str] = None,
  tables: Optional[str] = None,
  index_dtype: Optional[torch.dtype] = None,
  offset_dtype: Optional[torch.dtype] = None,
- ) -> List[TBERequest]:
+ ) -> list[TBERequest]:
  """
- Generate TBE requests from the input data file (`requests_data_file`)
+ Generate TBE requests from the input data file. If `requests_data_file` is provided,
+ `indices_file` and `offsets_file` should not be provided. If either `indices_file`
+ or `offsets_file` is provided, both must be provided.
  """
- indices_tensor, offsets_tensor, lengths_tensor = torch.load(requests_data_file)
+ assert not (
+ requests_data_file and (indices_file or offsets_file)
+ ), "If requests_data_file is provided, indices_file and offsets_file cannot be provided."
+ assert (
+ indices_file and offsets_file
+ ), "Both indices_file and offsets_file must be provided if either is provided."
+
+ if requests_data_file:
+ indices_tensor, offsets_tensor, *rest = torch.load(requests_data_file)
+ else:
+ indices_tensor = torch.load(indices_file)
+ offsets_tensor = torch.load(offsets_file)
 
  average_L = 0
  if tables is not None:
@@ -104,7 +119,7 @@ def generate_requests_from_data_file(
  average_L = int((offsets_tensor[-1] - offsets_tensor[0]) / B)
  assert (np.prod(offsets_tensor.size()) - 1) == np.prod((T, B)), (
  f"Data file (indices = {indices_tensor.size()}, "
- f"offsets = {offsets_tensor.size()}, lengths = {lengths_tensor.size()}) "
+ f"offsets = {offsets_tensor.size()}, lengths = {offsets_tensor.size() - 1}) "
  f"does not conform to inputs (T, B) = ({T}, {B})."
  )
 
@@ -163,12 +178,12 @@ def generate_int_data_from_stats(
 
  def generate_pooling_factors_from_stats(
  iters: int,
- Bs: List[int],
+ Bs: list[int],
  L: int,
  sigma_L: int,
  # distribution of pooling factors
  length_dist: str,
- ) -> Tuple[int, torch.Tensor]:
+ ) -> tuple[int, torch.Tensor]:
  """
  Generate pooling factors for the TBE requests from the given stats
  """
@@ -196,7 +211,7 @@ def generate_batch_sizes_from_stats(
  vbe_num_ranks: int,
  # Distribution of batch sizes
  batch_size_dist: str,
- ) -> Tuple[List[int], List[List[int]]]:
+ ) -> tuple[list[int], list[list[int]]]:
  """
  Generate batch sizes for features from the given stats
  """
@@ -219,7 +234,7 @@ def generate_batch_sizes_from_stats(
 
  def generate_indices_uniform(
  iters: int,
- Bs: List[int],
+ Bs: list[int],
  L: int,
  E: int,
  use_variable_L: bool,
@@ -237,7 +252,7 @@ def generate_indices_uniform(
  dtype=torch.int32,
  )
  # each bag is usually sorted
- (indices, _) = torch.sort(indices)
+ indices, _ = torch.sort(indices)
  if use_variable_L:
  # 1D layout, where row offsets are determined by L_offsets
  indices = torch.ops.fbgemm.bottom_k_per_row(
@@ -252,7 +267,7 @@ def generate_indices_zipf(
 
  def generate_indices_zipf(
  iters: int,
- Bs: List[int],
+ Bs: list[int],
  L: int,
  E: int,
  alpha: float,
@@ -309,7 +324,7 @@ def generate_indices_zipf(
 
  def update_indices_with_random_reuse(
  iters: int,
- Bs: List[int],
+ Bs: list[int],
  L: int,
  reuse: float,
  indices: torch.Tensor,
@@ -371,6 +386,9 @@ def generate_requests( # noqa C901
  zipf_oversample_ratio: int = 3,
  weighted: bool = False,
  requests_data_file: Optional[str] = None,
+ # Path to file containing indices and offsets. If provided, this will be used
+ indices_file: Optional[str] = None,
+ offsets_file: Optional[str] = None,
  # Comma-separated list of table numbers
  tables: Optional[str] = None,
  # If sigma_L is not None, treat L as mu_L and generate Ls from sigma_L
@@ -393,21 +411,28 @@ def generate_requests( # noqa C901
  vbe_num_ranks: Optional[int] = None,
  index_dtype: Optional[torch.dtype] = None,
  offset_dtype: Optional[torch.dtype] = None,
- ) -> List[TBERequest]:
+ ) -> list[TBERequest]:
  # TODO: refactor and split into helper functions to separate load from file,
  # generate from distribution, and other future methods of generating data
- if requests_data_file is not None:
+ if (
+ requests_data_file is not None
+ or indices_file is not None
+ or offsets_file is not None
+ ):
+
  assert sigma_L is None, "Variable pooling factors is not supported"
  assert sigma_B is None, "Variable batch sizes is not supported"
  return generate_requests_from_data_file(
- requests_data_file,
- iters,
- B,
- T,
- L,
- E,
- weighted,
- tables,
+ iters=iters,
+ B=B,
+ T=T,
+ L=L,
+ E=E,
+ weighted=weighted,
+ requests_data_file=requests_data_file,
+ indices_file=indices_file,
+ offsets_file=offsets_file,
+ tables=tables,
  index_dtype=index_dtype,
  offset_dtype=offset_dtype,
  )
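
Usage note (not part of the diff): with the new `indices_file` / `offsets_file` arguments above, a benchmark can feed pre-dumped tensors directly instead of a combined `requests_data_file`. A minimal sketch, assuming `generate_requests` is importable from `fbgemm_gpu.tbe.utils.requests` and that the two `.pt` files were previously written with `torch.save`:

    import torch
    from fbgemm_gpu.tbe.utils.requests import generate_requests  # assumed import path

    # indices.pt / offsets.pt are hypothetical files saved earlier with torch.save()
    requests = generate_requests(
        iters=10, B=512, T=2, L=1, E=100_000,
        indices_file="indices.pt",
        offsets_file="offsets.pt",
    )
    for req in requests:
        indices, offsets, per_sample_weights = req.unpack_3()
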
fbgemm_gpu/tbe_input_multiplexer.py CHANGED
@@ -8,9 +8,8 @@
  # pyre-unsafe
 
  import abc
-
  from dataclasses import dataclass
- from typing import List, Optional
+ from typing import Optional
 
  from torch import Tensor
 
@@ -22,15 +21,25 @@ class TBEInfo:
 
  Args:
  table_names: table names within the tbe
- table_heights: table heights (hashsize)
+ table_heights: sharded table heights (hashsize)
  tbe_uuid: a unique identifier for the TBE
  feature_table_map: feature to table map
+ table_dims: sharded table dimensions
+ full_table_heights: table heights before sharding
+ full_table_dims: table dimensions before sharding
+ row_offset: the shard offset of the current rank on row (height)
+ col_offset: the shard offset of the current rank on column (dim)
  """
 
- table_names: List[str]
- table_heights: List[int]
+ table_names: list[str]
+ table_heights: list[int]
  tbe_uuid: str
- feature_table_map: List[int]
+ feature_table_map: list[int]
+ table_dims: list[int]
+ full_table_heights: list[int]
+ full_table_dims: list[int]
+ row_offset: list[int]
+ col_offset: list[int]
 
 
  @dataclass(frozen=True)
@@ -45,7 +54,7 @@ class TBEInputInfo:
 
  indices: Tensor
  offsets: Tensor
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None
 
 
  class TBEInputMultiplexer(abc.ABC):
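
For orientation (not part of the diff): the widened `TBEInfo` record now carries both sharded and pre-sharding table shapes plus the shard offsets of the current rank. A hypothetical instantiation, assuming `TBEInfo` remains a plain dataclass importable from `fbgemm_gpu.tbe_input_multiplexer`; all numbers are illustrative only:

    from fbgemm_gpu.tbe_input_multiplexer import TBEInfo  # assumed import path

    # Two tables, row-sharded across ranks; values below are made up for illustration.
    info = TBEInfo(
        table_names=["user_id", "item_id"],
        table_heights=[250_000, 125_000],         # heights of the local shards
        tbe_uuid="tbe-0",
        feature_table_map=[0, 0, 1],              # three features mapped onto two tables
        table_dims=[128, 64],                     # sharded dims
        full_table_heights=[1_000_000, 500_000],  # heights before sharding
        full_table_dims=[128, 64],                # dims before sharding
        row_offset=[250_000, 125_000],            # this rank's row shard offsets
        col_offset=[0, 0],                        # no column sharding in this example
    )
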
fbgemm_gpu/triton/common.py CHANGED
@@ -10,7 +10,6 @@ from enum import IntEnum
 
  import torch
 
-
  # We keep LUTs persistent to minimize the number of device copies required.
  E2M1_LUT = torch.tensor(
  [0, 0.5, 1, 1.5, 2, 3, 4, 6, -0, -0.5, -1, -1.5, -2, -3, -4, -6],
fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py CHANGED
@@ -9,7 +9,7 @@
 
  # pyre-ignore-all-errors[6]
 
- from typing import List, Optional, Tuple, Union
+ from typing import Optional, Union
 
  import torch
  import triton # @manual
@@ -472,7 +472,7 @@ def triton_jagged_to_dense_optimization_2d(
  # In FBGEMM it was computed by GPU but in triton currently has some compilation issue so we use CUP computation method as workaround
  # However in real-world case if we only dealing with 2d jagged tensor we don't need to use this function at all
  def _jagged_offsets_to_dense_indice(
- offsets: List[torch.Tensor], dense_strides: List[int], dense_sizes: List[int]
+ offsets: list[torch.Tensor], dense_strides: list[int], dense_sizes: list[int]
  ) -> torch.Tensor:
 
  output_offset = torch.zeros(len(offsets[-1]) - 1, device="cpu", dtype=torch.int32)
@@ -532,8 +532,8 @@ def _jagged_offsets_to_dense_indice(
  # not be affected at all
  def jagged_to_dense(
  jagged_values: torch.Tensor,
- jagged_offsets: List[torch.Tensor],
- jagged_max_lengths: List[int],
+ jagged_offsets: list[torch.Tensor],
+ jagged_max_lengths: list[int],
  padding_value: float = 0.0, # padding value currently use 0.0 as default value
  operation_function: Union[
  str, None
@@ -720,10 +720,10 @@ def triton_dense_to_jagged(
 
  def dense_to_jagged(
  dense: torch.Tensor,
- jagged_offsets: List[torch.Tensor],
+ jagged_offsets: list[torch.Tensor],
  operation_function: Union[str, None] = None,
  operation_jagged_values: Union[torch.Tensor, None] = None,
- ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+ ) -> tuple[torch.Tensor, list[torch.Tensor]]:
 
  thread_block_row_size = 32
  thread_block_col_size = 32
@@ -780,7 +780,7 @@ def dense_to_jagged(
  # jagged_tensor + dense -> dense
  def jagged_dense_elementwise_add_dense_output(
  jagged_values: Tensor,
- jagged_offsets: List[Tensor],
+ jagged_offsets: list[Tensor],
  # pyre-fixme[2]: Parameter must be annotated.
  dense,
  ) -> Tensor:
@@ -800,8 +800,8 @@ def jagged_dense_elementwise_add_dense_output(
 
  # jagged_tensor + dense -> jagged_tensor
  def jagged_dense_elementwise_add_jagged_output(
- jagged_values: Optional[Tensor], jagged_offsets: List[Tensor], dense: Tensor
- ) -> Tuple[Tensor, List[Tensor]]:
+ jagged_values: Optional[Tensor], jagged_offsets: list[Tensor], dense: Tensor
+ ) -> tuple[Tensor, list[Tensor]]:
 
  return dense_to_jagged(
  dense,
@@ -813,8 +813,8 @@ def jagged_dense_elementwise_add_jagged_output(
 
  # jagged_tensor * dense -> jagged_tensor
  def jagged_dense_elementwise_mul_jagged_output(
- jagged_values: Optional[Tensor], jagged_offsets: List[Tensor], dense: Tensor
- ) -> Tuple[Tensor, List[Tensor]]:
+ jagged_values: Optional[Tensor], jagged_offsets: list[Tensor], dense: Tensor
+ ) -> tuple[Tensor, list[Tensor]]:
 
  return dense_to_jagged(
  dense,
fbgemm_gpu/triton/quantize.py CHANGED
@@ -11,7 +11,6 @@ from typing import Union
 
  import torch
  import triton # @manual
-
  import triton.language as tl # @manual
 
  from .common import get_mx4_exp_bias, get_mx4_lookup_table, RoundingMode
@@ -238,7 +237,7 @@ def _kernel_quantize_mx4(
  # We readd fp32_exp_bias for compatibility with cuda dequant.
  tl.store(
  out + exp_offset,
- (group_exp + FP32_EXP_BIAS).to(tl.int8),
+ (group_exp + FP32_EXP_BIAS).to(tl.uint8),
  # Prevent writing outside this chunk or the main array.
  mask=(exp_offset < OUTPUT_SIZE)
  & (exp_offset < (OUTPUT_CHUNK_SIZE * (pid + 1))),
@@ -575,7 +574,7 @@ def _kernel_dequantize_mx4(
  # Write final outputs.
  tl.store(
  out + output_offset,
- scaled_fp32,
+ scaled_fp32.to(out.dtype.element_ty),
  # Mask values that are out of this chunk or the main array.
  mask=(output_offset < OUTPUT_SIZE)
  & (output_offset < OUTPUT_CHUNK_SIZE * (pid + 1)),
@@ -588,10 +587,14 @@ def _kernel_dequantize_mx4(
 
 
  def triton_dequantize_mx4(
- a: torch.Tensor, group_size: int = 32, ebits: int = 2, mbits: int = 1
+ a: torch.Tensor,
+ group_size: int = 32,
+ ebits: int = 2,
+ mbits: int = 1,
+ output_dtype: torch.dtype = torch.float32,
  ) -> torch.Tensor:
  """
- Dequantize a tensor from mx4 format to fp32.
+ Dequantize a tensor from mx4 format to fp32 or bf16.
 
  Args:
  a (Tensor): [M / 2 + M / group_size] MX4 tensor packed into int8 values
@@ -599,13 +602,15 @@ def triton_dequantize_mx4(
  group_size (int): Size of chunks that use the same shared exponent.
  ebits (int): Number of bits to use for exponent in target mx4 format.
  mbits (int): Number of bits to use for mantissa in target mx4 format.
+ output_dtype (torch.dtype): Output dtype (FP32 or BF16).
+ Defaults to torch.float32 for backward compatibility.
 
  Returns:
- torch.Tensor: [M, K] dequantized fp32 tensor.
+ torch.Tensor: [M, K] dequantized tensor in the specified dtype.
  """
  # If given an empty shape, return an empty tensor.
  if a.numel() == 0:
- return torch.empty(a.shape, device=a.device, dtype=torch.float32)
+ return torch.empty(a.shape, device=a.device, dtype=output_dtype)
  # View a as 2D for simplicity.
  orig_shape = a.shape
  a = a.flatten()
@@ -622,9 +627,9 @@
  # Use a lookup table to convert
  mx4_to_fp_values = get_mx4_lookup_table(ebits, mbits, a.device)
 
- # Create output tensor.
+ # Create output tensor in target dtype.
  output_elems = num_groups * group_size
- out = torch.empty([output_elems], device=a.device, dtype=torch.float)
+ out = torch.empty([output_elems], device=a.device, dtype=output_dtype)
  # Check if we need to use int64 for indexing.
  use_int64 = num_threads * groups_per_thread * group_size > 2**31 - 1
  # Invoke triton dequantization kernel over rows.
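
Usage note (not part of the diff): the new `output_dtype` argument lets callers dequantize straight to bf16 without a separate cast. A sketch, assuming a companion `triton_quantize_mx4` packer in the same module produces the `[M / 2 + M / group_size]` layout expected here; this runs Triton kernels and so needs a GPU build:

    import torch
    from fbgemm_gpu.triton.quantize import (  # assumed import path
        triton_dequantize_mx4,
        triton_quantize_mx4,  # assumed companion packer
    )

    x = torch.randn(4, 256, device="cuda", dtype=torch.float32)
    packed = triton_quantize_mx4(x, group_size=32)              # MX4-packed payload
    y_bf16 = triton_dequantize_mx4(
        packed, group_size=32, output_dtype=torch.bfloat16      # new in this release
    )
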
fbgemm_gpu/utils/filestore.py CHANGED
@@ -11,7 +11,6 @@
  import io
  import logging
  import os
- import shutil
  from dataclasses import dataclass
  from pathlib import Path
  from typing import BinaryIO, Union
@@ -36,8 +35,6 @@ class FileStore:
  bucket: str
 
  def __post_init__(self) -> None:
- # self.bucket = bucket
-
  if not os.path.isdir(self.bucket):
  raise ValueError(f"Directory {self.bucket} does not exist")
 
@@ -78,7 +75,12 @@ class FileStore:
  elif isinstance(raw_input, Path):
  if not os.path.exists(raw_input):
  raise FileNotFoundError(f"File {raw_input} does not exist")
- shutil.copyfile(raw_input, filepath)
+ # Open the source file and destination file, and copy the contents
+ with open(raw_input, "rb") as src_file, open(
+ filepath, "wb"
+ ) as dst_file:
+ while chunk := src_file.read(4096): # Read 4 KB at a time
+ dst_file.write(chunk)
 
  elif isinstance(raw_input, io.BytesIO) or isinstance(raw_input, BinaryIO):
  with open(filepath, "wb") as file:
@@ -157,4 +159,53 @@ class FileStore:
  True if file exists, False otherwise.
  """
  filepath = f"{self.bucket}/{path}"
- return os.path.isfile(filepath)
+ return os.path.exists(filepath)
+
+ def create_directory(self, path: str) -> "FileStore":
+ """
+ Creates a directory in the file store.
+
+ Args:
+ path (str): The path of the node or symlink to a directory (relative
+ to `self.bucket`) to be created.
+
+ Returns:
+ self. This allows for method-chaining.
+ """
+ filepath = f"{self.bucket}/{path}"
+ event = f"creating directory {filepath}"
+ logger.info(f"FileStore: {event}")
+
+ try:
+ if not os.path.exists(filepath):
+ os.makedirs(filepath, exist_ok=True)
+ except Exception as e:
+ logger.error(f"FileStore: exception occurred when {event}: {e}")
+ raise e
+
+ return self
+
+ def remove_directory(self, path: str) -> "FileStore":
+ """
+ Removes a directory from the file store.
+
+ Args:
+ path (str): The path of the node or symlink to a directory (relative
+ to `self.bucket`) to be removed.
+
+ Returns:
+ self. This allows for method-chaining.
+ """
+ filepath = f"{self.bucket}/{path}"
+ event = f"deleting {filepath}"
+ logger.info(f"FileStore: {event}")
+
+ try:
+ if os.path.isdir(filepath):
+ os.rmdir(filepath)
+
+ except Exception as e:
+ logger.error(f"Manifold: exception occurred when {event}: {e}")
+ raise e
+
+ return self
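
Illustration (not part of the diff): the new directory helpers return `self`, so they chain like the existing file operations. A small sketch with a hypothetical bucket path; the bucket directory itself must already exist:

    from fbgemm_gpu.utils.filestore import FileStore  # assumed import path

    store = FileStore("/tmp")                          # hypothetical bucket
    store.create_directory("tbe_bench").create_directory("tbe_bench/run_0")
    # ... write artifacts under /tmp/tbe_bench/run_0 ...
    store.remove_directory("tbe_bench/run_0").remove_directory("tbe_bench")
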
fbgemm_gpu/utils/torch_library.py CHANGED
@@ -8,7 +8,7 @@
  # pyre-strict
 
  import re
- from typing import Callable, Dict
+ from typing import Callable
 
  import torch
 
@@ -112,7 +112,7 @@ class TorchLibraryFragment:
  self.lib.impl(op_name, fn, dispatch_key)
 
  # pyre-ignore[24]
- def register(self, op_name: str, functors: Dict[str, Callable]) -> None:
+ def register(self, op_name: str, functors: dict[str, Callable]) -> None:
  """
  Registers a set of dispatches for a defined operator.
 
fbgemm_gpu/utils/writeback_util.py ADDED
@@ -0,0 +1,124 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+
+
+ def writeback_update_gradient(
+ indices: torch.Tensor,
+ offsets: torch.Tensor,
+ grad: torch.Tensor,
+ feature_table_map: list[int],
+ ) -> torch.Tensor:
+ """
+ Update gradient tensor by deduplicating indices across all features/tables.
+ For duplicate indices, only the first occurrence receives the gradient to achieve the assign purpose via gradient update
+
+ NOTE: This function is not supporting VBE yet
+
+ Args:
+ indices (torch.Tensor): Embedding indices tensor
+ offsets (torch.Tensor): Offsets tensor for batched embeddings
+ grad (torch.Tensor): Gradient tensor to be updated
+ feature_table_map (list[int]): Mapping from feature to table
+
+ Returns:
+ torch.Tensor: Updated gradient tensor with duplicates masked out
+ """
+ if indices.numel() == 0:
+ return grad[0]
+ # get num of feature to estimate batch size
+ num_of_tables = len(feature_table_map)
+ assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+ batch_size = offsets.shape[0] // num_of_tables
+ max_indices = indices.max()
+ non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
+ # disable dedup across different table
+ indices = ((offsets[non_empty_index]) // batch_size) * (1 + max_indices) + indices
+ grad = grad[0]
+ _, idx, counts = torch.unique(
+ indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+ )
+ _, ind_sorted = torch.sort(idx, stable=True)
+ cum_sum = counts.cumsum(0)
+ cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
+ first_indicies = ind_sorted[cum_sum]
+ mask = torch.zeros_like(grad, device=grad.device)
+ original_index = non_empty_index[first_indicies]
+
+ mask[original_index] = grad[original_index]
+ return mask
+
+
+ def writeback_update_gradient_first_feature_only(
+ indices: torch.Tensor,
+ offsets: torch.Tensor,
+ grad: torch.Tensor,
+ feature_table_map: list[int],
+ ) -> torch.Tensor:
+ """
+ Special case of writeback_update_gradient where gradient only needs to be updated for the first feature. Other features will be forward-only
+
+ NOTE: This function is not supporting VBE yet
+
+ Args:
+ indices (torch.Tensor): Embedding indices tensor
+ offsets (torch.Tensor): Offsets tensor for batched embeddings
+ grad (torch.Tensor): Gradient tensor to be updated
+ feature_table_map (list[int]): Mapping from feature to table
+
+ Returns:
+ torch.Tensor: Updated gradient tensor with duplicates masked out
+ """
+ num_of_tables = len(feature_table_map)
+ batch_size = (offsets.shape[0] - 1) // num_of_tables
+ shrink_indices = indices[: offsets[batch_size]]
+ if shrink_indices.numel() == 0 or indices.numel() == 0:
+ return grad[0]
+ assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+
+ grad = grad[0]
+ _, idx, counts = torch.unique(
+ shrink_indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+ )
+ _, ind_sorted = torch.sort(idx, stable=True)
+ cum_sum = counts.cumsum(0)
+ cum_sum = torch.cat((torch.tensor([0]).to(shrink_indices.device), cum_sum[:-1]))
+ first_indicies = ind_sorted[cum_sum]
+ mask = torch.zeros_like(grad, device=grad.device)
+
+ mask[first_indicies] = grad[first_indicies]
+ return mask
+
+
+ def writeback_gradient(
+ grad: torch.Tensor,
+ indices: torch.Tensor,
+ offsets: torch.Tensor,
+ feature_table_map: list[int],
+ writeback_first_feature_only: bool = False,
+ ) -> tuple[torch.Tensor]:
+ """
+ Compute deduplicated gradient for writeback operation.
+
+ Args:
+ grad (torch.Tensor): Gradient tensor to be updated
+ indices (torch.Tensor): Embedding indices tensor
+ offsets (torch.Tensor): Offsets tensor for batched embeddings
+ feature_table_map (list[int]): Mapping from feature to table
+ writeback_first_feature_only (bool): If True, only first feature will apply gradient update, other features will be read-only
+
+ Returns:
+ tuple[torch.Tensor]: Tuple containing the updated gradient tensor
+ """
+ if writeback_first_feature_only:
+ return (
+ writeback_update_gradient_first_feature_only(
+ indices, offsets, grad, feature_table_map
+ ),
+ )
+ else:
+ return (writeback_update_gradient(indices, offsets, grad, feature_table_map),)
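
Worked example (not part of the package): how the new writeback helper deduplicates gradients. The sketch assumes the non-VBE, one-index-per-bag layout the code above relies on, with `grad[0]` holding one gradient row per bag; all shapes and values are illustrative.

    import torch
    from fbgemm_gpu.utils.writeback_util import writeback_gradient  # assumed import path

    feature_table_map = [0, 1]               # two features, one table each
    offsets = torch.tensor([0, 1, 2, 3, 4])  # one index per bag
    indices = torch.tensor([3, 3, 7, 7])     # duplicate indices within each table
    grad = torch.randn(1, 4, 8)              # grad[0]: one 8-dim row per bag

    (dedup_grad,) = writeback_gradient(grad, indices, offsets, feature_table_map)
    # Only the first occurrence of each (table, index) pair keeps its gradient row;
    # rows 1 and 3 come back as zeros in this example.
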
fbgemm_gpu/uvm.py CHANGED
@@ -12,6 +12,7 @@ from typing import Optional
 
  import torch
 
+ # fmt:skip
  from fbgemm_gpu.enums import create_enums
 
  try:
@@ -21,6 +22,8 @@ except Exception:
  torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:cumem_utils")
 
  # Import all uvm enums from c++ library
+ # pyre-fixme[6]: For 2nd argument expected `() -> List[Tuple[str, List[Tuple[str,
+ # int]]]]` but got `OpOverloadPacket`.
  create_enums(globals(), torch.ops.fbgemm.fbgemm_gpu_uvm_enum_query)
 
 
{fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: fbgemm_gpu_nightly-cpu
- Version: 2025.3.27
+ Version: 2026.1.29
  Home-page: https://github.com/pytorch/fbgemm
  Author: FBGEMM Team
  Author-email: packages@pytorch.org
@@ -12,11 +12,11 @@ Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: BSD License
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
  Description-Content-Type: text/markdown
  Requires-Dist: numpy
  Dynamic: author
@@ -40,9 +40,6 @@ PyTorch GPU operator libraries for training and inference. The library provides
  efficient table batched embedding bag, data layout transformation, and
  quantization supports.
 
- FBGEMM_GPU is currently tested with CUDA 12.4 and 11.8 in CI, and with PyTorch
- packages (2.1+) that are built against those CUDA versions.
-
  See the full [Documentation](https://pytorch.org/FBGEMM) for more information
  on building, installing, and developing with FBGEMM_GPU, as well as the most
  up-to-date support matrix for this library.