PyPI - fbgemm-gpu-nightly-cpu - Versions diffs - 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl - Mend

fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

fbgemm_gpu/__init__.py +112 -19
fbgemm_gpu/asmjit.so +0 -0
fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
fbgemm_gpu/config/feature_list.py +7 -1
fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
fbgemm_gpu/docs/sparse_ops.py +118 -0
fbgemm_gpu/docs/target.default.json.py +6 -0
fbgemm_gpu/enums.py +3 -4
fbgemm_gpu/fbgemm.so +0 -0
fbgemm_gpu/fbgemm_gpu_config.so +0 -0
fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
fbgemm_gpu/fbgemm_gpu_py.so +0 -0
fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
fbgemm_gpu/quantize/__init__.py +2 -0
fbgemm_gpu/quantize/quantize_ops.py +1 -0
fbgemm_gpu/quantize_comm.py +29 -12
fbgemm_gpu/quantize_utils.py +88 -8
fbgemm_gpu/runtime_monitor.py +9 -5
fbgemm_gpu/sll/__init__.py +3 -0
fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
fbgemm_gpu/sll/triton/__init__.py +0 -10
fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
fbgemm_gpu/sparse_ops.py +190 -54
fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
fbgemm_gpu/split_embedding_configs.py +134 -37
fbgemm_gpu/split_embedding_inference_converter.py +7 -6
fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
fbgemm_gpu/tbe/bench/__init__.py +6 -1
fbgemm_gpu/tbe/bench/bench_config.py +14 -3
fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
fbgemm_gpu/tbe/bench/utils.py +129 -5
fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
fbgemm_gpu/tbe/ssd/common.py +1 -0
fbgemm_gpu/tbe/ssd/inference.py +15 -15
fbgemm_gpu/tbe/ssd/training.py +1292 -267
fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
fbgemm_gpu/tbe/utils/offsets.py +6 -6
fbgemm_gpu/tbe/utils/quantize.py +8 -8
fbgemm_gpu/tbe/utils/requests.py +15 -15
fbgemm_gpu/tbe_input_multiplexer.py +10 -11
fbgemm_gpu/triton/common.py +0 -1
fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
fbgemm_gpu/triton/quantize.py +14 -9
fbgemm_gpu/utils/filestore.py +6 -2
fbgemm_gpu/utils/torch_library.py +2 -2
fbgemm_gpu/utils/writeback_util.py +124 -0
fbgemm_gpu/uvm.py +1 -0
{fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
list_versions/cli_run.py +161 -0
fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
{fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0

fbgemm_gpu/split_embedding_configs.py CHANGED Viewed

@@ -9,10 +9,11 @@
 import enum
 import itertools
-from typing import Any, Dict, List, Optional, Tuple  # noqa: F401
+from typing import Any, Dict  # noqa: F401
 import torch
+# fmt:skip
 from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
     EmbeddingLocation,
     SplitState,
@@ -36,6 +37,23 @@ def pad4(value: int) -> int:
     return (int(value) + 3) & ~3
+def pad16(value: int) -> int:
+    """
+    Compute the smallest multiple of 16 that is greater than or equal to the given value.
+    Parameters:
+        value (int): The integer to align (must be non-negative).
+    Returns:
+        int: The aligned value.
+    Raises:
+        ValueError: If the input is negative.
+        TypeError: If the input is not an integer.
+    """
+    return (int(value) + 15) & ~15
 @enum.unique
 class EmbOptimType(enum.Enum):
     SGD = "sgd"  # uses non-deterministic updates (atomicAdd(..)) with duplicate ids
@@ -64,13 +82,13 @@ class EmbOptimType(enum.Enum):
         return self.value
     def _extract_dtype(
-        self, optimizer_state_dtypes: Dict[str, "SparseType"], name: str
+        self, optimizer_state_dtypes: dict[str, "SparseType"], name: str
     ) -> torch.dtype:
         if optimizer_state_dtypes is None or name not in optimizer_state_dtypes:
             return torch.float32
         return optimizer_state_dtypes[name].as_dtype()
-    def state_names(self) -> List[str]:
+    def state_names(self) -> list[str]:
         """
         Returns the names of the optimizer states.  The order of the states will
         be the order in which they are processed and returned in
@@ -79,12 +97,12 @@ class EmbOptimType(enum.Enum):
         """
         if self == EmbOptimType.EXACT_ROWWISE_ADAGRAD:
             return ["momentum1"]
-        elif self == EmbOptimType.PARTIAL_ROWWISE_ADAM:
+        elif self in [EmbOptimType.PARTIAL_ROWWISE_ADAM, EmbOptimType.ADAM]:
             return ["momentum1", "momentum2"]
         else:
             return []
-    def state_size_table(self, D: int) -> Dict[str, int]:
+    def state_size_table(self, D: int) -> dict[str, int]:
         """
         Returns the table of state names to state sizes in terms of number of
         elements (per table row)
@@ -93,64 +111,84 @@ class EmbOptimType(enum.Enum):
             return {"momentum1": 1}
         elif self == EmbOptimType.PARTIAL_ROWWISE_ADAM:
             return {"momentum1": D, "momentum2": 1}
+        elif self == EmbOptimType.ADAM:
+            return {"momentum1": D, "momentum2": D}
         else:
             return {}
     def state_size_nbytes(
-        self, D: int, optimizer_state_dtypes: Dict[str, "SparseType"] = {}  # noqa: B006
+        self,
+        D: int,
+        optimizer_state_dtypes: dict[str, "SparseType"] = {},  # noqa: B006
     ) -> int:
         """
         Returns the size of the data (in bytes) required to hold the optimizer
-        state (per table row)
+        state (per table row).  This size includes byte-padding.
         """
-        return sum(
-            [
-                # For each state, multiply the number of elements by the byte
-                # size of each element
-                (self._extract_dtype(optimizer_state_dtypes, name).itemsize * elem)
-                for name, elem in self.state_size_table(D).items()
-            ]
-        )
+        momentum1_dtype = self._extract_dtype(optimizer_state_dtypes, "momentum1")
+        momentum2_dtype = self._extract_dtype(optimizer_state_dtypes, "momentum2")
+        if self == EmbOptimType.EXACT_ROWWISE_ADAGRAD:
+            return momentum1_dtype.itemsize
+        elif self == EmbOptimType.PARTIAL_ROWWISE_ADAM:
+            return pad4(1 * momentum2_dtype.itemsize) + D * momentum1_dtype.itemsize
+        elif self == EmbOptimType.ADAM:
+            return (D * momentum1_dtype.itemsize) + (D * momentum2_dtype.itemsize)
+        else:
+            return 0
     def byte_offsets_along_row(
         self,
         D: int,
         weights_precision: "SparseType",
-        optimizer_state_dtypes: Dict[str, "SparseType"] = {},  # noqa: B006
-    ) -> Dict[str, Tuple[int, int]]:
+        optimizer_state_dtypes: dict[str, "SparseType"] = {},  # noqa: B006
+    ) -> dict[str, tuple[int, int]]:
         """
         Returns the start and end byte offsets of each optimizer state along a
         cache row with optimizer state offloading enabled.
         """
+        # Extract the optimizer state dtypes
+        momentum1_dtype = self._extract_dtype(optimizer_state_dtypes, "momentum1")
+        momentum2_dtype = self._extract_dtype(optimizer_state_dtypes, "momentum2")
         # This is the pointer to where the optimizer state begins in the memory
         p0 = pad4(D) * weights_precision.as_dtype().itemsize
         if self == EmbOptimType.EXACT_ROWWISE_ADAGRAD:
-            momentum1_dtype = self._extract_dtype(optimizer_state_dtypes, "momentum1")
-            # Store one value for momentum per row
             return {"momentum1": (p0, p0 + momentum1_dtype.itemsize)}
         elif self == EmbOptimType.PARTIAL_ROWWISE_ADAM:
-            momentum1_dtype = self._extract_dtype(optimizer_state_dtypes, "momentum1")
-            momentum2_dtype = self._extract_dtype(optimizer_state_dtypes, "momentum2")
+            # momentum1 lies after momentum2
+            p1 = p0 + pad4(1 * momentum2_dtype.itemsize)
             return {
                 "momentum2": (p0, p0 + momentum2_dtype.itemsize),
                 "momentum1": (
-                    p0 + momentum2_dtype.itemsize,
-                    p0 + momentum2_dtype.itemsize + D * momentum1_dtype.itemsize,
+                    p1,
+                    p1 + D * momentum1_dtype.itemsize,
                 ),
             }
+        elif self == EmbOptimType.ADAM:
+            # momentum2 lies after momentum1
+            p1 = p0 + (D * momentum1_dtype.itemsize)
+            return {
+                "momentum1": (p0, p1),
+                "momentum2": (p1, p1 + D * momentum2_dtype.itemsize),
+            }
         else:
             return {}
     def empty_states(
         self,
-        rows: List[int],
-        dims: List[int],
-        optimizer_state_dtypes: Dict[str, "SparseType"] = {},  # noqa: B006
-    ) -> List[List[torch.Tensor]]:
+        rows: list[int],
+        dims: list[int],
+        optimizer_state_dtypes: dict[str, "SparseType"] = {},  # noqa: B006
+    ) -> list[list[torch.Tensor]]:
         """
         Creates sets of empty tensors per table to hold optimizer states based
         on the specified optimizer type, state dtypes, embedding specs, and
@@ -159,7 +197,7 @@ class EmbOptimType(enum.Enum):
         # Else, check that the local row count for each table is set
         assert len(rows) == len(dims)
-        opt_states_set: List[List[torch.Tensor]] = []
+        opt_states_set: list[list[torch.Tensor]] = []
         for r, D in zip(rows, dims):
             # Set up the table of state names to state sizes, ordered by their
@@ -186,20 +224,20 @@ class EmbOptimType(enum.Enum):
     def ssd_state_splits(
         self,
-        embedding_specs: List[Tuple[int, int]],  # Tuple of (rows, dims)
-        optimizer_state_dtypes: Dict[str, "SparseType"] = {},  # noqa: B006
+        embedding_specs: list[tuple[int, int]],  # Tuple of (rows, dims)
+        optimizer_state_dtypes: dict[str, "SparseType"] = {},  # noqa: B006
         enable_optimizer_offloading: bool = False,
-    ) -> List[Tuple[SplitState, str, torch.dtype]]:
+    ) -> list[tuple[SplitState, str, torch.dtype]]:
         """
         Returns the split planning for the optimizer states
         """
-        (rows, _) = zip(*embedding_specs)
+        rows, _ = zip(*embedding_specs)
         T_ = len(embedding_specs)
         # This is the cumulative row counts for rowwise states
-        row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows))
+        row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
         # This is the cumulative element counts for elementwise states
-        table_size_cumsum: List[int] = [0] + list(
+        table_size_cumsum: list[int] = [0] + list(
             itertools.accumulate([r * d for r, d in embedding_specs])
         )
@@ -207,6 +245,12 @@ class EmbOptimType(enum.Enum):
             params = {"momentum1": row_count_cumsum}
         elif self == EmbOptimType.PARTIAL_ROWWISE_ADAM:
             params = {"momentum1": table_size_cumsum, "momentum2": row_count_cumsum}
+        elif self == EmbOptimType.ADAM:
+            params = {
+                "momentum1": table_size_cumsum,
+                "momentum2": table_size_cumsum,
+                "row_counter": row_count_cumsum,
+            }
         else:
             params = {}
@@ -266,14 +310,54 @@ def sparse_type_to_int(sparse_type: "SparseType") -> int:
         SparseType.BF16.value: 5,
         SparseType.FP8.value: 6,
         SparseType.MX4.value: 7,
+        SparseType.NFP8.value: 8,
     }[sparse_type.value]
+def sparse_type_int_to_dtype(ty: int) -> torch.dtype:
+    """
+    TorchScript-compatible function to convert a SparseType enum (as an integer) to torch.dtype.
+    This is a standalone function equivalent to SparseType.from_int(dtype_int).as_dtype() that works
+    with TorchScript. TorchScript does not support @staticmethod on Enum classes,
+    so this function provides a workaround.
+    """
+    if ty == 0:  # fp32
+        return torch.float32
+    elif ty == 1:  # fp16
+        return torch.float16
+    elif ty == 2:  # int8
+        return torch.uint8
+    elif ty == 3:  # int4
+        return torch.quint4x2
+    elif ty == 4:  # int2
+        return torch.quint2x4
+    elif ty == 5:  # bf16
+        return torch.bfloat16
+    elif ty == 6:  # fp8
+        return torch.uint8
+    elif ty == 7:  # mx4
+        return torch.uint8
+    elif ty == 9:
+        return (
+            torch.float8_e4m3fnuz
+            if torch.version.hip is not None
+            else torch.float8_e4m3fn
+        )
+    else:  # Invalid is 7 or non enumerated.
+        raise ValueError(f"Unsupported sparse type: {ty}")
 @enum.unique
 class SparseType(enum.Enum):
     FP32 = "fp32"
     FP16 = "fp16"
     FP8 = "fp8"
+    # NFP8 refers to "native" FP8 in that it uses the GPU implementations
+    # of E4M3 whereas the other FP8 sparsetype uses a custom format. Use of
+    # NFP8 allows us to use hardware casting intrinsics which can be much faster.
+    # Eventually, we should merge these two types.
+    NFP8 = "nfp8"
     INT8 = "int8"
     INT4 = "int4"
     INT2 = "int2"
@@ -299,9 +383,11 @@ class SparseType(enum.Enum):
             return SparseType("bf16")
         elif ty == 6:
             return SparseType("fp8")
-        elif ty == 7:
+        elif ty == 8:
             return SparseType("mx4")
-        else:
+        elif ty == 9:
+            return SparseType("nfp8")
+        else:  # Invalid is 7 or non enumerated.
             raise ValueError(f"Unsupported sparse type: {ty}")
     def as_int(self) -> int:
@@ -323,6 +409,8 @@ class SparseType(enum.Enum):
             return SparseType("bf16")
         elif dtype == torch.uint8:
             return SparseType("mx4")
+        elif dtype == torch.float8_e4m3fnuz or dtype == torch.float8_e4m3fn:
+            return SparseType("nfp8")
         else:
             raise ValueError(f"Unsupported sparse dtype: {dtype}")
@@ -336,6 +424,11 @@ class SparseType(enum.Enum):
             SparseType.INT2.value: torch.quint2x4,
             SparseType.BF16.value: torch.bfloat16,
             SparseType.MX4.value: torch.uint8,
+            SparseType.NFP8.value: (
+                torch.float8_e4m3fnuz
+                if torch.version.hip is not None
+                else torch.float8_e4m3fn
+            ),
         }[self.value]
     def bit_rate(self) -> int:
@@ -348,6 +441,7 @@ class SparseType(enum.Enum):
             SparseType.INT2.value: 2,
             SparseType.BF16.value: 16,
             SparseType.MX4.value: 4,
+            SparseType.NFP8.value: 8,
         }[self.value]
     def align_size(self) -> int:
@@ -360,6 +454,7 @@ class SparseType(enum.Enum):
             SparseType.INT2.value: 16,
             SparseType.BF16.value: 2,
             SparseType.MX4.value: 8,
+            SparseType.NFP8.value: 4,
         }[self.value]
     def is_float(self) -> bool:
@@ -368,6 +463,7 @@ class SparseType(enum.Enum):
             or self.value == SparseType.FP16.value
             or self.value == SparseType.FP8.value
             or self.value == SparseType.BF16.value
+            or self.value == SparseType.NFP8.value
         ):
             return True
         else:
@@ -380,11 +476,12 @@ class SparseType(enum.Enum):
             return QuantizationConfig()
-ELEMENT_SIZE: Dict[SparseType, int] = {
+ELEMENT_SIZE: dict[SparseType, int] = {
     SparseType.FP32: 4,
     SparseType.FP16: 2,
     SparseType.FP8: 1,
     SparseType.INT8: 1,
     SparseType.BF16: 2,
+    SparseType.NFP8: 1,
     # SparseType.INT4: 0.5,
 }

fbgemm_gpu/split_embedding_inference_converter.py CHANGED Viewed

@@ -10,10 +10,11 @@
 import logging
 import math
-from typing import cast, Optional, Tuple
+from typing import cast, Optional
 import torch
+# fmt:skip
 from fbgemm_gpu.split_embedding_configs import (
     FP8QuantizationConfig,
     QuantizationConfig,
@@ -53,7 +54,7 @@ class SplitEmbInferenceConverter:
         return model
     # pyre-fixme[2]: Parameter must be annotated.
-    def _prune_by_weights_l2_norm(self, new_num_rows, weights) -> Tuple[Tensor, float]:
+    def _prune_by_weights_l2_norm(self, new_num_rows, weights) -> tuple[Tensor, float]:
         assert new_num_rows > 0
         from numpy.linalg import norm
@@ -75,7 +76,7 @@ class SplitEmbInferenceConverter:
         idx: int,
         num_rows: int,
         module: SplitTableBatchedEmbeddingBagsCodegen,
-    ) -> Tuple[Tensor, Optional[Tensor]]:
+    ) -> tuple[Tensor, Optional[Tensor]]:
         # TODO(yingz): Avoid DtoH / HtoD overhead.
         weights = module.split_embedding_weights()[idx].cpu()
         if self.pruning_ratio is None:
@@ -84,7 +85,7 @@ class SplitEmbInferenceConverter:
         if new_num_rows == num_rows:
             return (weights, None)
-        (indicators, threshold) = self._prune_by_weights_l2_norm(new_num_rows, weights)
+        indicators, threshold = self._prune_by_weights_l2_norm(new_num_rows, weights)
         return torch.ops.fbgemm.embedding_bag_rowwise_prune(
             weights, indicators, threshold, torch.int32
@@ -100,7 +101,7 @@ class SplitEmbInferenceConverter:
     def _quantize_embs(
         self, weight: Tensor, weight_ty: SparseType
-    ) -> Tuple[Tensor, Optional[Tensor]]:
+    ) -> tuple[Tensor, Optional[Tensor]]:
         fp8_quant_config = cast(FP8QuantizationConfig, self.quantization_config)
         return quantize_embs(weight, weight_ty, fp8_quant_config)
@@ -129,7 +130,7 @@ class SplitEmbInferenceConverter:
                 index_remapping_list = []
                 for t, (_, E, D, weight_ty) in enumerate(embedding_specs):
                     # Try to prune embeddings.
-                    (pruned_weight, index_remapping) = self._prune_embs(t, E, child)
+                    pruned_weight, index_remapping = self._prune_embs(t, E, child)
                     new_embedding_specs.append(
                         (
                             "",

fbgemm_gpu/split_table_batched_embeddings_ops_common.py CHANGED Viewed

@@ -11,12 +11,11 @@
 import enum
 from dataclasses import dataclass
-from typing import List, NamedTuple, Optional, Tuple
+from typing import FrozenSet, NamedTuple, Optional, Tuple
 import torch
 from torch import Tensor
 # Maximum number of times prefetch() can be called without
 # a corresponding forward() call
 MAX_PREFETCH_DEPTH = 100
@@ -62,10 +61,10 @@ class EmbeddingLocation(enum.IntEnum):
 class EvictionPolicy(NamedTuple):
     eviction_trigger_mode: int = (
-        0  # disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
+        0  # disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual, 4: id count, 5: free_mem
     )
     eviction_strategy: int = (
-        0  # 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+        0  # 0: timestamp, 1: counter, 2: counter + timestamp, 3: feature l2 norm, 4: timestamp threshold, 5: feature score
     )
     eviction_step_intervals: Optional[int] = (
         None  # trigger_step_interval if trigger mode is iteration
@@ -73,18 +72,33 @@ class EvictionPolicy(NamedTuple):
     eviction_mem_threshold_gb: Optional[int] = (
         None  # eviction trigger condition if trigger mode is mem_util
     )
-    counter_thresholds: Optional[List[int]] = (
-        None  # count_thresholds for each table if eviction strategy is feature score
+    counter_thresholds: Optional[list[int]] = (
+        None  # count_thresholds for each table if eviction strategy is counter
     )
-    ttls_in_mins: Optional[List[int]] = (
+    ttls_in_mins: Optional[list[int]] = (
         None  # ttls_in_mins for each table if eviction strategy is timestamp
     )
-    counter_decay_rates: Optional[List[float]] = (
-        None  # count_decay_rates for each table if eviction strategy is feature score
+    counter_decay_rates: Optional[list[float]] = (
+        None  # count_decay_rates for each table if eviction strategy is counter
+    )
+    feature_score_counter_decay_rates: Optional[list[float]] = (
+        None  # feature_score_counter_decay_rates for each table if eviction strategy is feature score
+    )
+    training_id_eviction_trigger_count: Optional[list[int]] = (
+        None  # Number of training IDs that, when exceeded, will trigger eviction for each table.
     )
-    l2_weight_thresholds: Optional[List[float]] = (
+    training_id_keep_count: Optional[list[int]] = (
+        None  # Target number of training IDs to retain in each table after eviction.
+    )
+    l2_weight_thresholds: Optional[list[float]] = (
         None  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
     )
+    threshold_calculation_bucket_stride: Optional[float] = (
+        0.2  # The width of each feature score bucket used for threshold calculation in feature score-based eviction.
+    )
+    threshold_calculation_bucket_num: Optional[int] = (
+        1000000  # 1M, Total number of feature score buckets used for threshold calculation in feature score-based eviction.
+    )
     interval_for_insufficient_eviction_s: int = (
         # wait at least # seconds before trigger next round of eviction, if last finished eviction is insufficient
         # insufficient means we didn't evict enough rows, so we want to wait longer time to
@@ -95,18 +109,30 @@ class EvictionPolicy(NamedTuple):
         # wait at least # seconds before trigger next round of eviction, if last finished eviction is sufficient
         60
     )
-    meta_header_lens: Optional[List[int]] = None  # metaheader length for each table
+    interval_for_feature_statistics_decay_s: int = (
+        24 * 3600  # 1 day, interval for feature statistics decay
+    )
+    meta_header_lens: Optional[list[int]] = None  # metaheader length for each table
+    eviction_free_mem_threshold_gb: Optional[int] = (
+        None  # Minimum free memory (in GB) required before triggering eviction when using free_mem trigger mode.
+    )
+    eviction_free_mem_check_interval_batch: Optional[int] = (
+        None  # Number of batches between checks for free memory threshold when using free_mem trigger mode.
+    )
+    enable_eviction_for_feature_score_eviction_policy: Optional[list[bool]] = (
+        None  # enable eviction if eviction policy is feature score, false means no eviction
+    )
     def validate(self) -> None:
-        assert self.eviction_trigger_mode in [0, 1, 2, 3], (
-            "eviction_trigger_mode must be 0, 1, 2, or 3, "
+        assert self.eviction_trigger_mode in [0, 1, 2, 3, 4, 5], (
+            "eviction_trigger_mode must be 0, 1, 2, 3, 4, 5"
             f"actual {self.eviction_trigger_mode}"
         )
         if self.eviction_trigger_mode == 0:
             return
-        assert self.eviction_strategy in [0, 1, 2, 3], (
-            "eviction_strategy must be 0, 1, 2, or 3, "
+        assert self.eviction_strategy in [0, 1, 2, 3, 4, 5], (
+            "eviction_strategy must be 0, 1, 2, 3, 4 or 5, "
             f"actual {self.eviction_strategy}"
         )
         if self.eviction_trigger_mode == 1:
@@ -121,6 +147,17 @@ class EvictionPolicy(NamedTuple):
             assert (
                 self.eviction_mem_threshold_gb is not None
             ), "eviction_mem_threshold_gb must be set if eviction_trigger_mode is 2"
+        elif self.eviction_trigger_mode == 4:
+            assert (
+                self.training_id_eviction_trigger_count is not None
+            ), "training_id_eviction_trigger_count must be set if eviction_trigger_mode is 4"
+        elif self.eviction_trigger_mode == 5:
+            assert (
+                self.eviction_free_mem_threshold_gb is not None
+            ), "eviction_free_mem_threshold_gb must be set if eviction_trigger_mode is 5"
+            assert (
+                self.eviction_free_mem_check_interval_batch is not None
+            ), "eviction_free_mem_check_interval_batch must be set if eviction_trigger_mode is 5"
         if self.eviction_strategy == 0:
             assert self.ttls_in_mins is not None, (
@@ -161,21 +198,58 @@ class EvictionPolicy(NamedTuple):
                 "counter_thresholds and ttls_in_mins must have the same length, "
                 f"actual {self.counter_thresholds} vs {self.ttls_in_mins}"
             )
+        elif self.eviction_strategy == 5:
+            assert self.feature_score_counter_decay_rates is not None, (
+                "feature_score_counter_decay_rates must be set if eviction_strategy is 5, "
+                f"actual {self.feature_score_counter_decay_rates}"
+            )
+            assert self.training_id_eviction_trigger_count is not None, (
+                "training_id_eviction_trigger_count must be set if eviction_strategy is 5,"
+                f"actual {self.training_id_eviction_trigger_count}"
+            )
+            assert self.training_id_keep_count is not None, (
+                "training_id_keep_count must be set if eviction_strategy is 5,"
+                f"actual {self.training_id_keep_count}"
+            )
+            assert self.threshold_calculation_bucket_stride is not None, (
+                "threshold_calculation_bucket_stride must be set if eviction_strategy is 5,"
+                f"actual {self.threshold_calculation_bucket_stride}"
+            )
+            assert self.threshold_calculation_bucket_num is not None, (
+                "threshold_calculation_bucket_num must be set if eviction_strategy is 5,"
+                f"actual {self.threshold_calculation_bucket_num}"
+            )
+            assert self.enable_eviction_for_feature_score_eviction_policy is not None, (
+                "enable_eviction_for_feature_score_eviction_policy must be set if eviction_strategy is 5,"
+                f"actual {self.enable_eviction_for_feature_score_eviction_policy}"
+            )
+            assert (
+                len(self.enable_eviction_for_feature_score_eviction_policy)
+                == len(self.training_id_keep_count)
+                == len(self.feature_score_counter_decay_rates)
+            ), (
+                "feature_score_thresholds, enable_eviction_for_feature_score_eviction_policy, and training_id_keep_count must have the same length, "
+                f"actual {self.training_id_keep_count} vs {self.feature_score_counter_decay_rates} vs {self.enable_eviction_for_feature_score_eviction_policy}"
+            )
 class KVZCHParams(NamedTuple):
     # global bucket id start and global bucket id end offsets for each logical table,
     # where start offset is inclusive and end offset is exclusive
-    bucket_offsets: List[Tuple[int, int]] = []
+    bucket_offsets: list[tuple[int, int]] = []
     # bucket size for each logical table
     # the value indicates corresponding input space for each bucket id, e.g. 2^50 / total_num_buckets
-    bucket_sizes: List[int] = []
+    bucket_sizes: list[int] = []
     # enable optimizer offloading or not
     enable_optimizer_offloading: bool = False
     # when enabled, backend will return whole row(metaheader + weight + optimizer) instead of weight only
     # can only be enabled when enable_optimizer_offloading is enabled
     backend_return_whole_row: bool = False
     eviction_policy: EvictionPolicy = EvictionPolicy()
+    embedding_cache_mode: bool = False
+    load_ckpt_without_opt: bool = False
+    optimizer_type_for_st: Optional[str] = None
+    optimizer_state_dtypes_for_st: Optional[FrozenSet[Tuple[str, int]]] = None
     def validate(self) -> None:
         assert len(self.bucket_offsets) == len(self.bucket_sizes), (
@@ -188,6 +262,25 @@ class KVZCHParams(NamedTuple):
         ), "backend_return_whole_row can only be enabled when enable_optimizer_offloading is enabled"
+class KVZCHTBEConfig(NamedTuple):
+    # Eviction trigger mode for kvzch table: 0: disabled, 1: iteration, 2: mem_util, 3: manual, 4: id count, 5: free_mem
+    kvzch_eviction_trigger_mode: int = 2  # mem_util
+    # Minimum free memory (in GB) required before triggering eviction when using free_mem trigger mode.
+    eviction_free_mem_threshold_gb: int = 200  # 200GB
+    # Number of batches between checks for free memory threshold when using free_mem trigger mode.
+    eviction_free_mem_check_interval_batch: int = 1000
+    # The width of each feature score bucket used for threshold calculation in feature score-based eviction.
+    threshold_calculation_bucket_stride: float = 0.2
+    # Total number of feature score buckets used for threshold calculation in feature score-based eviction.
+    threshold_calculation_bucket_num: Optional[int] = 1000000  # 1M
+    # When true, we only save weight to kvzch backend and not optimizer state.
+    load_ckpt_without_opt: bool = False
+    # [DO NOT USE] This is for st publish only, do not set it in your config
+    optimizer_type_for_st: Optional[str] = None
+    # [DO NOT USE] This is for st publish only, do not set it in your config
+    optimizer_state_dtypes_for_st: Optional[FrozenSet[Tuple[str, int]]] = None
 class BackendType(enum.IntEnum):
     SSD = 0
     DRAM = 1
@@ -288,8 +381,8 @@ SplitState: NamedTuple = NamedTuple(
         ("dev_size", int),
         ("host_size", int),
         ("uvm_size", int),
-        ("placements", List[EmbeddingLocation]),
-        ("offsets", List[int]),
+        ("placements", list[EmbeddingLocation]),
+        ("offsets", list[int]),
     ],
 )
@@ -297,15 +390,15 @@ SplitState: NamedTuple = NamedTuple(
 @dataclass
 class CacheState:
     # T + 1 elements and cache_hash_size_cumsum[-1] == total_cache_hash_size
-    cache_hash_size_cumsum: List[int]
-    cache_index_table_map: List[int]
+    cache_hash_size_cumsum: list[int]
+    cache_index_table_map: list[int]
     total_cache_hash_size: int
 def construct_cache_state(
-    row_list: List[int],
-    location_list: List[EmbeddingLocation],
-    feature_table_map: List[int],
+    row_list: list[int],
+    location_list: list[EmbeddingLocation],
+    feature_table_map: list[int],
 ) -> CacheState:
     _cache_hash_size_cumsum = [0]
     total_cache_hash_size = 0