fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/sparse_ops.py CHANGED
@@ -7,10 +7,12 @@
  # pyre-strict
 
  import math
- from typing import Callable, List, Optional, Sequence, Tuple
+ from collections.abc import Sequence
+ from typing import Callable, Optional
 
  import torch
 
+ # fmt:skip
  from fbgemm_gpu.split_embedding_configs import SparseType
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
  from fbgemm_gpu.utils.loader import load_torch_module
@@ -48,8 +50,7 @@ except Exception:
 
  import torch.utils._pytree as pytree
  from torch import SymInt, Tensor
- from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
-
+ from torch.fx.experimental.symbolic_shapes import guard_or_true
 
  if hasattr(torch.library, "register_fake"):
  # pyre-ignore[9]
@@ -74,7 +75,7 @@ def permute_2D_sparse_data_input1D_meta(
  stride: int,
  weights: Optional[Tensor] = None,
  permuted_lengths_sum: Optional[int] = None,
- ) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
+ ) -> tuple[Tensor, Tensor, Optional[Tensor]]:
  torch._check(
  lengths.dim() == 1, lambda: f"expected lengths.dim() == 1, got {lengths.dim()}"
  )
@@ -111,7 +112,7 @@ def permute_2D_sparse_data_input1D_backward(
  grad_lengths: torch.Tensor,
  grad_values: torch.Tensor,
  grad_weights: torch.Tensor,
- ) -> Tuple[None, Tensor, Tensor, None, Tensor, None]:
+ ) -> tuple[None, Tensor, Tensor, None, Tensor, None]:
  inv_permute = torch.ops.fbgemm.invert_permute(ctx.permute)
  permuted_grad_lengths, permuted_grad_values, permuted_grad_weights = (
  torch.ops.fbgemm.permute_2D_sparse_data_input1D(
@@ -139,7 +140,7 @@ def permute_2D_sparse_data_meta(
  values: Tensor,
  weights: Optional[Tensor] = None,
  permuted_lengths_sum: Optional[int] = None,
- ) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
+ ) -> tuple[Tensor, Tensor, Optional[Tensor]]:
  torch._check(
  lengths.dim() == 2, lambda: f"expected lengths.dim() == 2, got {lengths.dim()}"
  )
@@ -166,6 +167,89 @@ def invert_permute_abstract(permute: Tensor) -> Tensor:
  return torch.empty_like(permute)
 
 
+ def get_source_mask_meta(
+ num_sources: Tensor, num_targets: Tensor, output_size: Optional[int] = None
+ ) -> Tensor:
+ if output_size is None:
+ ctx = torch.library.get_ctx()
+ output_size = ctx.new_dynamic_size()
+ return torch.empty([output_size], dtype=torch.bool)
+
+
+ def get_source_mask(
+ num_sources: Tensor, num_targets: Tensor, output_size: Optional[int] = None
+ ) -> Tensor:
+ """
+ Generate a boolean mask indicating which elements are from sources vs targets.
+
+ This is a Python wrapper that computes output_size when not provided,
+ enabling the operation to work with meta tensors for compilation.
+
+ Args:
+ num_sources: 1D tensor of source counts per batch element
+ num_targets: 1D tensor of target counts per batch element
+ output_size: Optional pre-computed output size.
+
+ Returns:
+ A 1D boolean tensor where True indicates source elements and False
+ indicates target elements
+
+ Example:
+ >>> num_sources = torch.tensor([2, 3])
+ >>> num_targets = torch.tensor([1, 2])
+ >>> get_source_mask(num_sources, num_targets)
+ tensor([True, True, False, True, True, True, False, False])
+ """
+ # Compute output_size if not provided and tensors are regular (not meta/fake)
+ if output_size is None:
+ combined = num_sources + num_targets
+ output_size = int(combined.sum().item())
+
+ return torch.ops.fbgemm.get_source_mask(num_sources, num_targets, output_size)
+
+
+ def repeat_arange_meta(lengths: Tensor) -> Tensor:
+ """Meta implementation for repeat_arange."""
+ # Output size is data-dependent (sum of lengths).
+ # For FakeTensors (used in torch.compile), we use dynamic sizing.
+ # For actual meta tensors, we cannot determine the size so return empty.
+ if lengths.device.type == "meta":
+ # Actual meta tensors: return a zero-sized tensor as placeholder
+ # since we cannot compute the data-dependent output size
+ return torch.empty([0], dtype=lengths.dtype, device=lengths.device)
+ else:
+ # FakeTensor context: use dynamic sizing for proper shape tracking
+ ctx = torch.library.get_ctx()
+ output_size = ctx.new_dynamic_size()
+ return torch.empty([output_size], dtype=lengths.dtype, device=lengths.device)
+
+
+ def repeat_arange(lengths: Tensor) -> Tensor:
+ """
+ Creates a concatenated tensor of aranges based on a lengths tensor.
+
+ This is a high-performance CUDA kernel that replaces the inefficient PyTorch
+ implementation which uses 4+ separate kernels (cumsum, arange, repeat_interleave, sub).
+
+ Args:
+ lengths: 1D tensor of lengths for each arange sequence
+
+ Returns:
+ A 1D tensor containing concatenated arange sequences
+
+ Example:
+ >>> lengths = torch.tensor([3, 5, 2])
+ >>> repeat_arange(lengths)
+ tensor([0, 1, 2, 0, 1, 2, 3, 4, 0, 1])
+
+ Performance:
+ - PyTorch implementation: 4+ kernel launches + intermediate allocations
+ - CUDA implementation: 1 fused kernel, no intermediate allocations
+ - Typical speedup: 3-5x on realistic workloads
+ """
+ return torch.ops.fbgemm.repeat_arange(lengths)
+
+
  # pyre-ignore
  def permute_2D_sparse_data_setup_context(ctx, inputs, output):
  permute, lengths, values, weights, permuted_lengths_sum = inputs
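
Note: the docstrings above pin down the semantics of the two new ops well enough to sketch plain-PyTorch equivalents. The snippet below is an illustration only; the real ops are the registered fbgemm kernels, and the *_reference names are ours, not part of the package.

    import torch

    def get_source_mask_reference(num_sources, num_targets):
        # Per batch element i: num_sources[i] True values followed by
        # num_targets[i] False values, concatenated into one 1D mask.
        return torch.cat([
            torch.cat([
                torch.ones(int(s), dtype=torch.bool),
                torch.zeros(int(t), dtype=torch.bool),
            ])
            for s, t in zip(num_sources.tolist(), num_targets.tolist())
        ])

    def repeat_arange_reference(lengths):
        # The multi-kernel path the fused CUDA kernel replaces:
        # cumsum -> arange -> repeat_interleave -> sub.
        starts = torch.cumsum(lengths, dim=0) - lengths              # segment start offsets
        positions = torch.arange(int(lengths.sum().item()))          # 0 .. total-1
        return positions - torch.repeat_interleave(starts, lengths)  # restart per segment

    # get_source_mask_reference(torch.tensor([2, 3]), torch.tensor([1, 2]))
    #   -> tensor([ True,  True, False,  True,  True,  True, False, False])
    # repeat_arange_reference(torch.tensor([3, 5, 2]))
    #   -> tensor([0, 1, 2, 0, 1, 2, 3, 4, 0, 1])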
@@ -197,7 +281,7 @@ def permute_1D_sparse_data_meta(
  values: Tensor,
  weights: Optional[Tensor] = None,
  permuted_lengths_sum: Optional[int] = None,
- ) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
+ ) -> tuple[Tensor, Tensor, Optional[Tensor]]:
  indices = values
  permuted_lengths_size = permute.numel()
  permuted_lengths = lengths.new_empty([permuted_lengths_size])
@@ -218,7 +302,7 @@
 
  def masked_select_jagged_1d(
  values: Tensor, lengths: Tensor, mask: Tensor
- ) -> Tuple[Tensor, Tensor]:
+ ) -> tuple[Tensor, Tensor]:
  torch._check(values.dim() == 1)
  torch._check(lengths.dim() == 1)
  torch._check(values.device == lengths.device)
@@ -231,11 +315,11 @@
 
 
  def tbe_input_combine_abstract(
- indices_list: List[Tensor],
- offsets_list: List[Tensor],
- per_sample_weights: List[Tensor],
+ indices_list: list[Tensor],
+ offsets_list: list[Tensor],
+ per_sample_weights: list[Tensor],
  include_last_offsets: Tensor,
- ) -> Tuple[Tensor, Tensor, Tensor]:
+ ) -> tuple[Tensor, Tensor, Tensor]:
  torch._check(len(indices_list) > 0)
  torch._check(len(indices_list) == len(offsets_list))
  torch._check(len(indices_list) == len(per_sample_weights))
@@ -250,7 +334,7 @@ def tbe_input_combine_abstract(
  torch._check(index.is_contiguous())
  torch._check(offset.is_contiguous())
  total_indices = total_indices + index.numel()
- if guard_size_oblivious(weight.numel() > 0):
+ if guard_or_true(weight.numel() > 0):
  torch._check(weight.dim() == 1)
  torch._check(weight.numel() == index.numel())
  torch._check(weight.is_contiguous())
@@ -268,10 +352,10 @@
 
 
  def tbe_input_combine_with_length_abstract(
- indices_list: List[Tensor],
- offsets_list: List[Tensor],
- per_sample_weights: List[Tensor],
- ) -> Tuple[Tensor, Tensor, Tensor]:
+ indices_list: list[Tensor],
+ offsets_list: list[Tensor],
+ per_sample_weights: list[Tensor],
+ ) -> tuple[Tensor, Tensor, Tensor]:
  torch._check(len(indices_list) > 0)
  torch._check(len(indices_list) == len(offsets_list))
  torch._check(len(indices_list) == len(per_sample_weights))
@@ -287,7 +371,7 @@ def tbe_input_combine_with_length_abstract(
  torch._check(offset.is_contiguous())
  total_indices = total_indices + index.numel()
  total_offsets = total_offsets + offset.numel()
- if guard_size_oblivious(weight.numel() > 0):
+ if guard_or_true(weight.numel() > 0):
  torch._check(weight.dim() == 1)
  torch._check(weight.numel() == index.numel())
  torch._check(weight.is_contiguous())
@@ -339,7 +423,7 @@ def expand_into_jagged_permute_meta(
  permute: Tensor,
  input_offsets: Tensor,
  output_offsets: Tensor,
- output_size: Tuple[int, ...],
+ output_size: tuple[int, ...],
  ) -> Tensor:
  torch._check(permute.numel() > 0, lambda: "expected {permute.numel} > 0")
  torch._check(
@@ -465,7 +549,7 @@ def block_bucketize_sparse_features_meta(
  keep_orig_idx: bool = False,
  total_num_blocks: Optional[torch.Tensor] = None,
  keep_orig_idx_per_feature: Optional[torch.Tensor] = None,
- ) -> Tuple[
+ ) -> tuple[
  torch.Tensor,
  torch.Tensor,
  Optional[torch.Tensor],
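
Note: the guard_size_oblivious to guard_or_true swap in the hunks above changes how data-dependent size checks behave during fake-tensor tracing. A rough illustration, assuming guard_or_true's documented behavior of evaluating the condition when it is statically decidable and otherwise assuming True rather than raising (the helper function name below is ours):

    import torch
    from torch.fx.experimental.symbolic_shapes import guard_or_true

    def check_optional_weight(weight, index):
        # With a concrete tensor this behaves like a plain bool test; under
        # tracing with an unbacked symbolic numel, guard_or_true falls back to
        # True instead of raising a data-dependent guard error, so the
        # per-weight checks below are still applied on the optimistic path.
        if guard_or_true(weight.numel() > 0):
            torch._check(weight.dim() == 1)
            torch._check(weight.numel() == index.numel())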
@@ -485,8 +569,43 @@
  )
 
 
+ def block_bucketize_sparse_features_2d_weights_meta(
+ lengths: torch.Tensor,
+ indices: torch.Tensor,
+ bucketize_pos: bool,
+ sequence: bool,
+ block_sizes: torch.Tensor,
+ my_size: int,
+ weights: torch.Tensor,
+ weights_dim: int = 1,
+ batch_size_per_feature: Optional[torch.Tensor] = None,
+ max_B: int = -1,
+ block_bucketize_pos: Optional[torch.Tensor] = None,
+ keep_orig_idx: bool = False,
+ total_num_blocks: Optional[torch.Tensor] = None,
+ keep_orig_idx_per_feature: Optional[torch.Tensor] = None,
+ ) -> tuple[
+ torch.Tensor,
+ torch.Tensor,
+ torch.Tensor,
+ Optional[torch.Tensor],
+ Optional[torch.Tensor],
+ ]:
+ # Output: lengths, indices, weights", pos?, unbucketize_permute?
+ num_buckets = my_size
+ num_features = lengths.size(0)
+ num_values = indices.size(0)
+ return (
+ lengths.new_empty([num_buckets * num_features]),
+ indices.new_empty([num_values]),
+ weights.new_empty([num_values, weights_dim]),
+ indices.new_empty([num_values]) if bucketize_pos else None,
+ indices.new_empty([num_values]),
+ )
+
+
  def merge_pooled_embeddings(
- pooled_embeddings: List[torch.Tensor],
+ pooled_embeddings: list[torch.Tensor],
  uncat_dim_size: int,
  target_device: torch.device,
  cat_dim: int = 1,
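
Note: a small shape illustration of what the new block_bucketize_sparse_features_2d_weights meta function promises. The sizes below are made-up placeholders; only the shape relationships come from the function body above.

    import torch

    # num_features = 4, num_values = 100, weights_dim = 2, my_size (buckets) = 8
    lengths = torch.empty(4, dtype=torch.int64, device="meta")
    indices = torch.empty(100, dtype=torch.int64, device="meta")
    weights = torch.empty(100, 2, device="meta")

    out = block_bucketize_sparse_features_2d_weights_meta(
        lengths, indices, bucketize_pos=True, sequence=False,
        block_sizes=torch.empty(4, dtype=torch.int64, device="meta"),
        my_size=8, weights=weights, weights_dim=2,
    )
    # out[0].shape == (32,)     bucketized lengths: my_size * num_features
    # out[1].shape == (100,)    bucketized indices: one per input value
    # out[2].shape == (100, 2)  bucketized 2-D weights: [num_values, weights_dim]
    # out[3].shape == (100,)    positions (present because bucketize_pos=True)
    # out[4].shape == (100,)    unbucketize permutation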
@@ -517,7 +636,7 @@
 
  def permute_sparse_features_abstract(
  permute: Tensor, lengths: Tensor, indices: Tensor, weights: Optional[Tensor] = None
- ) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
+ ) -> tuple[Tensor, Tensor, Optional[Tensor]]:
  torch._check(lengths.dtype == indices.dtype)
  torch._check(permute.device == lengths.device)
  torch._check(permute.device == indices.device)
@@ -548,7 +667,7 @@ def segment_sum_csr_abstract(
 
  def dense_to_jagged_forward(
  dense: torch.Tensor,
- offsets: List[torch.Tensor],
+ offsets: list[torch.Tensor],
  total_L: Optional[torch.SymInt] = None,
  ) -> torch.Tensor:
  if total_L is None:
@@ -563,9 +682,9 @@
 
  def dense_to_jagged(
  dense: torch.Tensor,
- offsets: List[torch.Tensor],
+ offsets: list[torch.Tensor],
  total_L: Optional[torch.SymInt] = None,
- ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+ ) -> tuple[torch.Tensor, list[torch.Tensor]]:
  if total_L is None:
  total_L = torch.library.get_ctx().new_dynamic_size()
  return (dense_to_jagged_forward(dense, offsets, total_L), offsets)
@@ -574,9 +693,9 @@ def dense_to_jagged(
  def batch_index_select_dim0_abstract(
  inputs: torch.Tensor,
  indices: torch.Tensor,
- input_num_indices: List[int],
- input_rows: List[int],
- input_columns: List[int],
+ input_num_indices: list[int],
+ input_rows: list[int],
+ input_columns: list[int],
  permute_output_dim_0_1: bool,
  ) -> torch.Tensor:
  """
@@ -618,11 +737,11 @@ def batch_index_select_dim0_tensor_abstract(
  def batch_index_select_dim0_forward_cuda_impl_abstract(
  inputs: torch.Tensor,
  indices: torch.Tensor,
- input_num_indices: List[int],
- input_rows: List[int],
- input_columns: List[int],
+ input_num_indices: list[int],
+ input_rows: list[int],
+ input_columns: list[int],
  permute_output_dim_0_1: bool,
- ) -> List[torch.Tensor]:
+ ) -> list[torch.Tensor]:
  num_inputs = len(input_rows)
  torch._check(len(input_num_indices) == len(input_rows))
  torch._check(len(input_num_indices) == len(input_columns))
@@ -659,7 +778,7 @@ def batch_index_select_dim0_tensor_forward_cuda_impl_abstract(
  input_rows: torch.Tensor,
  input_columns: torch.Tensor,
  permute_output_dim_0_1: bool,
- ) -> List[torch.Tensor]:
+ ) -> list[torch.Tensor]:
  num_inputs: int = input_rows.size(0)
  torch._check(input_num_indices.size(0) == input_rows.size(0))
  torch._check(input_num_indices.size(0) == input_columns.size(0))
@@ -704,7 +823,7 @@ def keyed_jagged_index_select_dim1_abstract(
  batch_size: torch.SymInt,
  weights: Optional[torch.Tensor] = None,
  selected_lengths_sum: Optional[torch.SymInt] = None,
- ) -> List[torch.Tensor]:
+ ) -> list[torch.Tensor]:
  """
  This meta function is used to calculate the shape of output tensors
  from the original function `fbgemm::keyed_jagged_index_select_dim1` without the actual data.
@@ -729,7 +848,7 @@
  torch.index_select(lengths, 0, length_indices).sum().item()
  )
 
- ret: List[torch.Tensor] = [
+ ret: list[torch.Tensor] = [
  # pyre-ignore
  values.new_empty([selected_lengths_sum]),
  lengths.new_empty([indices.shape[0] * num_batches]),
@@ -761,17 +880,17 @@ def batch_index_select_dim0_backward_cuda_impl_abstract(
 
  def batch_index_select_dim0_forward_cpu_impl_abstract(
  inputs: torch.Tensor,
  indices: torch.Tensor,
- input_num_indices: List[int],
- input_rows: List[int],
- input_columns: List[int],
+ input_num_indices: list[int],
+ input_rows: list[int],
+ input_columns: list[int],
  permute_output_dim_0_1: bool,
- ) -> List[torch.Tensor]:
+ ) -> list[torch.Tensor]:
  # input lists must have the same length
  num_inputs = len(input_num_indices)
  torch._check(num_inputs == len(input_rows))
  torch._check(num_inputs == len(input_columns))
 
- if permute_output_dim_0_1 and guard_size_oblivious(len(input_num_indices) > 0):
+ if permute_output_dim_0_1 and guard_or_true(len(input_num_indices) > 0):
  # All num_indices must be the same if permute_output_dim_0_1 is True
  for x in input_num_indices:
  torch._check(x == input_num_indices[0])
@@ -795,7 +914,7 @@ def batch_index_select_dim0_tensor_forward_cpu_impl_abstract(
  input_rows: torch.Tensor,
  input_columns: torch.Tensor,
  permute_output_dim_0_1: bool,
- ) -> List[torch.Tensor]:
+ ) -> list[torch.Tensor]:
  # input lists must have the same length
  num_inputs = len(input_num_indices)
  torch._check(num_inputs == len(input_rows))
@@ -845,8 +964,8 @@ def bounds_check_indices_abstract(
 
 
  def group_index_select_dim0_gpu_impl_abstract(
- inputs: List[torch.Tensor], group_size: int
- ) -> List[torch.Tensor]:
+ inputs: list[torch.Tensor], group_size: int
+ ) -> list[torch.Tensor]:
  """
  Calculate output shapes for group_index_select_dim0_gpu_impl
  without the actual data.
@@ -876,8 +995,8 @@ def group_index_select_dim0_gpu_impl_abstract(
 
 
  def group_index_select_dim0_gpu_backward_abstract(
- all_inputs: List[torch.Tensor], output_shape_group_ref: List[torch.SymInt]
- ) -> List[torch.Tensor]:
+ all_inputs: list[torch.Tensor], output_shape_group_ref: list[torch.SymInt]
+ ) -> list[torch.Tensor]:
  """
  Calculate output shapes for group_index_select_dim0_gpu_backward
  without the actual data.
@@ -910,7 +1029,7 @@ def keyed_jagged_index_select_dim1_forward_cuda_impl_abstract(
  batch_size: torch.SymInt,
  weights: Optional[torch.Tensor] = None,
  selected_lengths_sum: Optional[torch.SymInt] = None,
- ) -> List[torch.Tensor]:
+ ) -> list[torch.Tensor]:
  num_batches = lengths.size(0) // batch_size
  torch._check(lengths.size(0) + 1 == offsets.size(0))
  # pyre-ignore
@@ -924,7 +1043,7 @@ def keyed_jagged_index_select_dim1_forward_cuda_impl_abstract(
  selected_lengths_sum = torch.library.get_ctx().new_dynamic_size()
 
  torch._check_is_size(selected_lengths_sum)
- vlw: List[torch.Tensor] = [
+ vlw: list[torch.Tensor] = [
  values.new_empty([selected_lengths_sum]), # output
  lengths.new_empty([indices.shape[0] * num_batches]), # output_lengths
  ]
@@ -967,7 +1086,7 @@ def histogram_binning_calibration_abstract(
  upper_bound: float,
  bin_ctr_in_use_after: int,
  bin_ctr_weight_value: float,
- ) -> Tuple[Tensor, Tensor]:
+ ) -> tuple[Tensor, Tensor]:
  return torch.empty_like(logit), torch.empty([logit.numel()], dtype=torch.int64)
 
 
@@ -1118,7 +1237,7 @@ def generic_histogram_binning_calibration_by_feature(
  positive_weight: float,
  bin_ctr_in_use_after: int,
  bin_ctr_weight_value: float,
- ) -> Tuple[Tensor, Tensor]:
+ ) -> tuple[Tensor, Tensor]:
  torch._check(bin_num_examples.numel() == bin_num_positives.numel())
  torch._check(
  bin_num_examples.numel() == (num_segments + 1) * (bin_boundaries.numel() + 1)
@@ -1129,13 +1248,13 @@
  )
 
  def permute_multi_embedding_function_impl_abstract(
- pooled_embs: List[Tensor],
+ pooled_embs: list[Tensor],
  permutes: Tensor,
  in_shapes: Tensor,
  out_shapes: Tensor,
- out_lengths: List[int],
+ out_lengths: list[int],
  reverse: bool = False,
- ) -> List[Tensor]:
+ ) -> list[Tensor]:
  out_dtype = pooled_embs[0].dtype
  bs = pooled_embs[0].shape[0]
  torch._check(permutes.shape[1] == 6, lambda: "permutes must have 6 columns")
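
Note: several of the fake/meta functions above (dense_to_jagged, keyed_jagged_index_select_dim1, and the new get_source_mask/repeat_arange metas) share one pattern for data-dependent output sizes. A minimal sketch of that pattern outside any specific op; the op name below is hypothetical:

    import torch

    def some_op_meta(values: torch.Tensor) -> torch.Tensor:
        # Only valid inside a registered fake/meta implementation call.
        ctx = torch.library.get_ctx()
        output_size = ctx.new_dynamic_size()  # fresh unbacked SymInt for a data-dependent size
        torch._check_is_size(output_size)     # constrain it to be a valid (non-negative) size
        return values.new_empty([output_size])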
@@ -1161,15 +1280,25 @@ def lengths_range_abstract(
 
 
  def all_to_one_device(
- input_tensors: List[Tensor],
+ input_tensors: list[Tensor],
  target_device: torch.device,
- ) -> List[Tensor]:
+ ) -> list[Tensor]:
  return [
  torch.empty_like(input_tensor, device=torch.device("meta"))
  for input_tensor in input_tensors
  ]
 
 
+ def sum_reduce_to_one(
+ input_tensors: list[Tensor],
+ target_device: torch.device,
+ ) -> Tensor:
+ torch._check(len(input_tensors) > 0, lambda: "reducing no tensor is undefined")
+ # All tensors should have the same shape
+ first_tensor = input_tensors[0]
+ return torch.empty_like(first_tensor, device=torch.device("meta"))
+
+
  def _setup() -> None:
  # pyre-ignore[16]
  _setup.done = getattr(_setup, "done", False)
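
Note: sum_reduce_to_one above is only the fake/meta shape function; the output shares the first input's shape. Presumably the real operator moves the inputs to target_device and sums them elementwise; a hedged reference sketch based only on that reading, with a name of our own:

    import torch

    def sum_reduce_to_one_reference(input_tensors, target_device):
        # Assumes every input shares one shape, per the meta function's comment.
        assert len(input_tensors) > 0, "reducing no tensor is undefined"
        out = input_tensors[0].to(target_device).clone()
        for t in input_tensors[1:]:
            out = out + t.to(target_device)
        return out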
@@ -1202,6 +1331,8 @@ def _setup() -> None:
  )
 
  impl_abstract("fbgemm::permute_2D_sparse_data", permute_2D_sparse_data_meta)
+ impl_abstract("fbgemm::get_source_mask", get_source_mask_meta)
+ impl_abstract("fbgemm::repeat_arange", repeat_arange_meta)
  impl_abstract(
  "fbgemm::permute_2D_sparse_data_input1D",
  permute_2D_sparse_data_input1D_meta,
@@ -1234,6 +1365,10 @@ def _setup() -> None:
  "fbgemm::block_bucketize_sparse_features",
  block_bucketize_sparse_features_meta,
  )
+ impl_abstract(
+ "fbgemm::block_bucketize_sparse_features_2d_weights",
+ block_bucketize_sparse_features_2d_weights_meta,
+ )
  impl_abstract("fbgemm::merge_pooled_embeddings", merge_pooled_embeddings)
  impl_abstract(
  "fbgemm::permute_sparse_features", permute_sparse_features_abstract
@@ -1241,6 +1376,7 @@ def _setup() -> None:
  impl_abstract("fbgemm::segment_sum_csr", segment_sum_csr_abstract)
  impl_abstract("fbgemm::dense_to_jagged_forward", dense_to_jagged_forward)
  impl_abstract("fbgemm::all_to_one_device", all_to_one_device)
+ impl_abstract("fbgemm::sum_reduce_to_one", sum_reduce_to_one)
  impl_abstract(
  "fbgemm::batch_index_select_dim0", batch_index_select_dim0_abstract
  )
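
Note: impl_abstract in the _setup() registrations is the name sparse_ops.py binds in the earlier hunk (torch.library.register_fake is preferred when present). A condensed sketch of that dispatch, with the new ops registered the same way as the existing ones; the older fallback is an assumption:

    import torch

    impl_abstract = (
        torch.library.register_fake              # newer PyTorch
        if hasattr(torch.library, "register_fake")
        else torch.library.impl_abstract         # assumed older fallback
    )
    impl_abstract("fbgemm::get_source_mask", get_source_mask_meta)
    impl_abstract("fbgemm::repeat_arange", repeat_arange_meta)
    impl_abstract("fbgemm::sum_reduce_to_one", sum_reduce_to_one)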
fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py CHANGED
@@ -152,6 +152,18 @@ except:
  DeprecationWarning,
  )
 
+ try:
+ # Import is placed under a try-except bc the op is experimental and can be
+ # removed/updated in the future
+ import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_adam_ssd as lookup_adam_ssd # noqa: F401
+ except:
+ warnings.warn(
+ f"""\033[93m
+ Failed to import: fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_adam_ssd
+ \033[0m""",
+ DeprecationWarning,
+ )
+
  try:
  # Import is placed under a try-except bc the op is experimental and can be
  # removed/updated in the future
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py CHANGED
@@ -56,14 +56,15 @@ def invoke(
  "vbe_B_offsets_rank_per_feature": vbe_metadata.B_offsets_rank_per_feature,
  "lxu_cache_locations": common_args.lxu_cache_locations,
  "uvm_cache_stats": common_args.uvm_cache_stats,
+ "vbe_output_offsets" : vbe_metadata.vbe_output_offsets,
  }
 
  dict_aux_int: Dict[str, int] = {
- "iter": iter,
- "info_B_num_bits": common_args.info_B_num_bits,
+ "iter": iter,
+ "info_B_num_bits": common_args.info_B_num_bits,
  "info_B_mask": common_args.info_B_mask,
  }
-
+
 
  dict_aux_float: Dict[str, float] = {
  "gwd_lower_bound": gwd_lower_bound,
@@ -81,7 +82,7 @@
 
  # Explicitly pass only prev_iter_dev for global weight decay, unless it already exists in optim arg
  dict_aux_tensor["prev_iter_dev"] = prev_iter_dev
-
+
 
  # optimizer_args # if optimizer == none
  dict_aux_bool["gradient_clipping"] = optimizer_args.gradient_clipping
@@ -132,6 +133,11 @@ def invoke(
  "Please check the frontend and backend version. "
  )
  aux_tensor.append(dict_aux_tensor["prev_iter_dev"])
+ assert "vbe_output_offsets" in dict_aux_tensor, (
+ "vbe_output_offsets must be in dict_aux_tensor. "
+ "Please check the frontend and backend version. "
+ )
+ aux_tensor.append(dict_aux_tensor["vbe_output_offsets"])
 
  aux_int: List[int] = []
  assert "iter" in dict_aux_int, (
@@ -204,7 +210,7 @@
  # ['momentum1', 'learning_rate_tensor', 'optim_float']
  optim_float: List[float] = []
  optim_float.append(dict_optim_float["eps"])
- # optim_bool
+ # optim_bool
 
  return torch.ops.fbgemm.split_embedding_codegen_lookup_adagrad_function_pt2(
  # common_args
@@ -226,6 +232,7 @@ def invoke(
  max_B=vbe_metadata.max_B,
  max_B_feature_rank=vbe_metadata.max_B_feature_rank,
  vbe_output_size=vbe_metadata.output_size,
+ vbe_output=vbe_metadata.vbe_output,
  # aux_tensor
  aux_tensor=aux_tensor,
  # aux_int
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py CHANGED
@@ -67,14 +67,15 @@ def invoke(
  "vbe_B_offsets_rank_per_feature": vbe_metadata.B_offsets_rank_per_feature,
  "lxu_cache_locations": common_args.lxu_cache_locations,
  "uvm_cache_stats": common_args.uvm_cache_stats,
+ "vbe_output_offsets" : vbe_metadata.vbe_output_offsets,
  }
 
  dict_aux_int: Dict[str, int] = {
- "iter": iter,
- "info_B_num_bits": common_args.info_B_num_bits,
+ "iter": iter,
+ "info_B_num_bits": common_args.info_B_num_bits,
  "info_B_mask": common_args.info_B_mask,
  }
-
+
 
  dict_aux_float: Dict[str, float] = {
  "gwd_lower_bound": gwd_lower_bound,
@@ -92,7 +93,7 @@
 
  # Explicitly pass only prev_iter_dev for global weight decay, unless it already exists in optim arg
  dict_aux_tensor["prev_iter_dev"] = prev_iter_dev
-
+
 
  # optimizer_args # if optimizer == none
  dict_aux_bool["gradient_clipping"] = optimizer_args.gradient_clipping
@@ -125,13 +126,13 @@ def invoke(
  momentum2.placements,
  momentum2.offsets,
  ] if momentum2 is not None else None
-
+
  if optimizer_args.use_rowwise_bias_correction and row_counter is not None:
  row_counter_host = None # not supported on CPU
  row_counter_dev = row_counter.dev
  row_counter_uvm = row_counter.uvm
  row_counter_offsets = row_counter.offsets
- row_counter_placements = row_counter.placements
+ row_counter_placements = row_counter.placements
  elif optimizer_args.use_rowwise_bias_correction:
  assert False, "`use_rowwise_bias_correction` is set, `row_counter` cannot be None"
  else:
@@ -173,6 +174,11 @@ def invoke(
  "Please check the frontend and backend version. "
  )
  aux_tensor.append(dict_aux_tensor["prev_iter_dev"])
+ assert "vbe_output_offsets" in dict_aux_tensor, (
+ "vbe_output_offsets must be in dict_aux_tensor. "
+ "Please check the frontend and backend version. "
+ )
+ aux_tensor.append(dict_aux_tensor["vbe_output_offsets"])
 
  aux_int: List[int] = []
  assert "iter" in dict_aux_int, (
@@ -271,7 +277,7 @@ def invoke(
  optim_float.append(dict_optim_float["weight_decay"])
  # optim_bool
  optim_bool: List[bool] = []
- optim_bool.append(dict_optim_bool["use_rowwise_bias_correction"])
+ optim_bool.append(dict_optim_bool["use_rowwise_bias_correction"])
 
  return torch.ops.fbgemm.split_embedding_codegen_lookup_adam_function_pt2(
  # common_args
@@ -293,6 +299,7 @@ def invoke(
  max_B=vbe_metadata.max_B,
  max_B_feature_rank=vbe_metadata.max_B_feature_rank,
  vbe_output_size=vbe_metadata.output_size,
+ vbe_output=vbe_metadata.vbe_output,
  # aux_tensor
  aux_tensor=aux_tensor,
  # aux_int
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py CHANGED
@@ -27,6 +27,8 @@ class VBEMetadata(NamedTuple):
  max_B_feature_rank: int = -1
  max_B: int = -1
  output_size: int = -1
+ vbe_output: Optional[torch.Tensor] = None
+ vbe_output_offsets: Optional[torch.Tensor] = None
 
 
  class CommonArgs(NamedTuple):
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py CHANGED
@@ -27,6 +27,8 @@ class VBEMetadata(NamedTuple):
  max_B_feature_rank: int = -1
  max_B: int = -1
  output_size: int = -1
+ vbe_output: Optional[torch.Tensor] = None
+ vbe_output_offsets: Optional[torch.Tensor] = None
 
 
  class CommonArgs(NamedTuple):
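
Note: both lookup_args.py and lookup_args_ssd.py gain the same two optional VBEMetadata fields. Because they default to None, existing call sites are unaffected; a caller opting in could attach a pre-allocated VBE output roughly as sketched below. The tensor values and variable names are placeholders, not package API; only the field names come from the hunks above.

    import torch

    preallocated_output = torch.empty(1024, dtype=torch.float32)   # placeholder size
    output_offsets = torch.tensor([0, 256, 512, 768, 1024])        # placeholder offsets

    vbe_metadata = vbe_metadata._replace(   # standard NamedTuple copy-with-update
        vbe_output=preallocated_output,
        vbe_output_offsets=output_offsets,
    )
    # The lookup_adagrad/lookup_adam invokers above then forward these as
    # vbe_output=vbe_metadata.vbe_output and the "vbe_output_offsets" aux tensor.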