fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
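The bulk of this diff is fbgemm_gpu/tbe/ssd/training.py, shown below. As a rough, hypothetical sketch of the constructor surface after this change (class, enum, and argument names are taken from the hunks that follow; the import path is inferred from the file location, and the table shapes and values are illustrative only — the module itself still targets CUDA devices at runtime):

import torch

from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType, SparseType
from fbgemm_gpu.split_table_batched_embeddings_ops_common import BackendType, PoolingMode
from fbgemm_gpu.tbe.ssd.training import SSDTableBatchedEmbeddingBags

# Two tables of (rows, dims); the numbers here are made up for illustration.
tbe = SSDTableBatchedEmbeddingBags(
    embedding_specs=[(1_000_000, 128), (500_000, 64)],
    feature_table_map=[0, 1],
    cache_sets=4096,
    ssd_storage_directory="/tmp/ssd_tbe",  # may now be a comma-separated list of paths
    ssd_rocksdb_shards=8,
    optimizer=OptimType.EXACT_ROWWISE_ADAGRAD,
    weights_precision=SparseType.FP16,
    output_dtype=SparseType.FP32,
    pooling_mode=PoolingMode.SUM,
    backend_type=BackendType.SSD,          # new in this release: SSD, PS, or DRAM
    kv_zch_params=None,                    # new: zero-collision (KV ZCH) configuration
    enable_raw_embedding_streaming=False,  # new: raw embedding streaming toggle
    learning_rate=0.01,
)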
@@ -12,13 +12,15 @@ import contextlib
  import functools
  import itertools
  import logging
+ import math
  import os
- import tempfile
  import threading
  import time
+ from functools import cached_property
  from math import floor, log2
- from typing import Any, Callable, List, Optional, Tuple, Type, Union
+ from typing import Any, Callable, ClassVar, Optional, Union
  import torch # usort:skip
+ import weakref

  # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
  import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
@@ -29,9 +31,13 @@ from fbgemm_gpu.runtime_monitor import (
  )
  from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType, SparseType
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
+ BackendType,
  BoundsCheckMode,
  CacheAlgorithm,
  EmbeddingLocation,
+ EvictionPolicy,
+ get_bounds_check_version_for_platform,
+ KVZCHParams,
  PoolingMode,
  SplitState,
  )
@@ -39,21 +45,23 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
  apply_split_helper,
  CounterBasedRegularizationDefinition,
  CowClipDefinition,
+ RESParams,
  UVMCacheStatsIndex,
  WeightDecayMode,
  )
  from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
+ check_allocated_vbe_output,
  generate_vbe_metadata,
+ is_torchdynamo_compiling,
  )
-
  from torch import distributed as dist, nn, Tensor # usort:skip
+ import sys
  from dataclasses import dataclass

  from torch.autograd.profiler import record_function

  from ..cache import get_unique_indices_v2
-
- from .common import ASSOC
+ from .common import ASSOC, pad4, tensor_pad4
  from .utils.partially_materialized_tensor import PartiallyMaterializedTensor


@@ -69,6 +77,14 @@ class IterData:
  max_B: Optional[int] = -1


+ @dataclass
+ class KVZCHCachedData:
+ cached_optimizer_states_per_table: list[list[torch.Tensor]]
+ cached_weight_tensor_per_table: list[torch.Tensor]
+ cached_id_tensor_per_table: list[torch.Tensor]
+ cached_bucket_splits: list[torch.Tensor]
+
+
  class SSDTableBatchedEmbeddingBags(nn.Module):
  D_offsets: Tensor
  lxu_cache_weights: Tensor
@@ -86,12 +102,19 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  weights_placements: Tensor
  weights_offsets: Tensor
  _local_instance_index: int = -1
+ res_params: RESParams
+ table_names: list[str]
+ _all_tbe_instances: ClassVar[weakref.WeakSet] = weakref.WeakSet()
+ _first_instance_ref: ClassVar[weakref.ref] = None
+ _eviction_triggered: ClassVar[bool] = False

  def __init__(
  self,
- embedding_specs: List[Tuple[int, int]], # tuple of (rows, dims)
- feature_table_map: Optional[List[int]], # [T]
+ embedding_specs: list[tuple[int, int]], # tuple of (rows, dims)
+ feature_table_map: Optional[list[int]], # [T]
  cache_sets: int,
+ # A comma-separated string, e.g. "/data00_nvidia0,/data01_nvidia0/", db shards
+ # will be placed in these paths round-robin.
  ssd_storage_directory: str,
  ssd_rocksdb_shards: int = 1,
  ssd_memtable_flush_period: int = -1,
@@ -131,13 +154,16 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  pooling_mode: PoolingMode = PoolingMode.SUM,
  bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING,
  # Parameter Server Configs
- ps_hosts: Optional[Tuple[Tuple[str, int]]] = None,
+ ps_hosts: Optional[tuple[tuple[str, int]]] = None,
  ps_max_key_per_request: Optional[int] = None,
  ps_client_thread_num: Optional[int] = None,
  ps_max_local_index_length: Optional[int] = None,
  tbe_unique_id: int = -1,
- # in local test we need to use the pass in path for rocksdb creation
- # in production we need to do it inside SSD mount path which will ignores the passed in path
+ # If set to True, will use `ssd_storage_directory` as the ssd paths.
+ # If set to False, will use the default ssd paths.
+ # In local tests we need to use the passed-in path for rocksdb creation.
+ # In production we could either use the default ssd mount points or explicitly specify ssd
+ # mount points using `ssd_storage_directory`.
  use_passed_in_path: int = True,
  gather_ssd_cache_stats: Optional[bool] = False,
  stats_reporter_config: Optional[TBEStatsReporterConfig] = None,
@@ -152,19 +178,126 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  # number of rows will be decided by bulk_init_chunk_size / size_of_each_row
  bulk_init_chunk_size: int = 0,
  lazy_bulk_init_enabled: bool = False,
+ backend_type: BackendType = BackendType.SSD,
+ kv_zch_params: Optional[KVZCHParams] = None,
+ enable_raw_embedding_streaming: bool = False, # whether enable raw embedding streaming
+ res_params: Optional[RESParams] = None, # raw embedding streaming sharding info
+ flushing_block_size: int = 2_000_000_000, # 2GB
+ table_names: Optional[list[str]] = None,
+ use_rowwise_bias_correction: bool = False, # For Adam use
+ optimizer_state_dtypes: dict[str, SparseType] = {}, # noqa: B006
+ pg: Optional[dist.ProcessGroup] = None,
  ) -> None:
  super(SSDTableBatchedEmbeddingBags, self).__init__()

+ # Set the optimizer
+ assert optimizer in (
+ OptimType.EXACT_ROWWISE_ADAGRAD,
+ OptimType.PARTIAL_ROWWISE_ADAM,
+ OptimType.ADAM,
+ ), f"Optimizer {optimizer} is not supported by SSDTableBatchedEmbeddingBags"
+ self.optimizer = optimizer
+
+ # Set the table weight and output dtypes
+ assert weights_precision in (SparseType.FP32, SparseType.FP16)
+ self.weights_precision = weights_precision
+ self.output_dtype: int = output_dtype.as_int()
+
+ if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
+ # Adagrad currently only supports FP32 for momentum1
+ self.optimizer_state_dtypes: dict[str, SparseType] = {
+ "momentum1": SparseType.FP32,
+ }
+ else:
+ self.optimizer_state_dtypes: dict[str, SparseType] = optimizer_state_dtypes
+
+ # Zero collision TBE configurations
+ self.kv_zch_params = kv_zch_params
+ self.backend_type = backend_type
+ self.enable_optimizer_offloading: bool = False
+ self.backend_return_whole_row: bool = False
+ self._embedding_cache_mode: bool = False
+ self.load_ckpt_without_opt: bool = False
+ if self.kv_zch_params:
+ self.kv_zch_params.validate()
+ self.load_ckpt_without_opt = (
+ # pyre-ignore [16]
+ self.kv_zch_params.load_ckpt_without_opt
+ )
+ self.enable_optimizer_offloading = (
+ # pyre-ignore [16]
+ self.kv_zch_params.enable_optimizer_offloading
+ )
+ self.backend_return_whole_row = (
+ # pyre-ignore [16]
+ self.kv_zch_params.backend_return_whole_row
+ )
+
+ if self.enable_optimizer_offloading:
+ logging.info("Optimizer state offloading is enabled")
+ if self.backend_return_whole_row:
+ assert (
+ self.backend_type == BackendType.DRAM
+ ), f"Only DRAM backend supports backend_return_whole_row, but got {self.backend_type}"
+ logging.info(
+ "Backend will return whole row including metaheader, weight and optimizer for checkpoint"
+ )
+ # pyre-ignore [16]
+ self._embedding_cache_mode = self.kv_zch_params.embedding_cache_mode
+ if self._embedding_cache_mode:
+ logging.info("KVZCH is in embedding_cache_mode")
+ assert self.optimizer in [
+ OptimType.EXACT_ROWWISE_ADAGRAD
+ ], f"only EXACT_ROWWISE_ADAGRAD supports embedding cache mode, but got {self.optimizer}"
+ if self.load_ckpt_without_opt:
+ if (
+ # pyre-ignore [16]
+ self.kv_zch_params.optimizer_type_for_st
+ == OptimType.PARTIAL_ROWWISE_ADAM.value
+ ):
+ self.optimizer = OptimType.PARTIAL_ROWWISE_ADAM
+ logging.info(
+ f"Override optimizer type with {self.optimizer=} for st publish"
+ )
+ if (
+ # pyre-ignore [16]
+ self.kv_zch_params.optimizer_state_dtypes_for_st
+ is not None
+ ):
+ optimizer_state_dtypes = {}
+ for k, v in dict(
+ self.kv_zch_params.optimizer_state_dtypes_for_st
+ ).items():
+ optimizer_state_dtypes[k] = SparseType.from_int(v)
+ self.optimizer_state_dtypes = optimizer_state_dtypes
+ logging.info(
+ f"Override optimizer_state_dtypes with {self.optimizer_state_dtypes=} for st publish"
+ )
+
  self.pooling_mode = pooling_mode
  self.bounds_check_mode_int: int = bounds_check_mode.value
  self.embedding_specs = embedding_specs
- (rows, dims) = zip(*embedding_specs)
+ self.table_names = table_names if table_names is not None else []
+ rows, dims = zip(*embedding_specs)
  T_ = len(self.embedding_specs)
  assert T_ > 0
  # pyre-fixme[8]: Attribute has type `device`; used as `int`.
  self.current_device: torch.device = torch.cuda.current_device()

- self.feature_table_map: List[int] = (
+ self.enable_raw_embedding_streaming = enable_raw_embedding_streaming
+ # initialize the raw embedding streaming related variables
+ self.res_params: RESParams = res_params or RESParams()
+ if self.enable_raw_embedding_streaming:
+ self.res_params.table_sizes = [0] + list(itertools.accumulate(rows))
+ res_port_from_env = os.getenv("LOCAL_RES_PORT")
+ self.res_params.res_server_port = (
+ int(res_port_from_env) if res_port_from_env else 0
+ )
+ logging.info(
+ f"get env {self.res_params.res_server_port=}, at rank {dist.get_rank()}, with {self.res_params=}"
+ )
+
+ self.feature_table_map: list[int] = (
  feature_table_map if feature_table_map is not None else list(range(T_))
  )
  T = len(self.feature_table_map)
@@ -177,7 +310,11 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  feature_dims = [dims[t] for t in self.feature_table_map]
  D_offsets = [dims[t] for t in self.feature_table_map]
  D_offsets = [0] + list(itertools.accumulate(D_offsets))
+
+ # Sum of row length of all tables
  self.total_D: int = D_offsets[-1]
+
+ # Max number of elements required to store a row in the cache
  self.max_D: int = max(dims)
  self.register_buffer(
  "D_offsets",
@@ -189,6 +326,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self.total_hash_size_bits: int = 0
  else:
  self.total_hash_size_bits: int = int(log2(float(hash_size_cumsum[-1])) + 1)
+ self.register_buffer(
+ "table_hash_size_cumsum",
+ torch.tensor(
+ hash_size_cumsum, device=self.current_device, dtype=torch.int64
+ ),
+ )
  # The last element is to easily access # of rows of each table by
  self.total_hash_size_bits = int(log2(float(hash_size_cumsum[-1])) + 1)
  self.total_hash_size: int = hash_size_cumsum[-1]
@@ -229,13 +372,25 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  "feature_dims",
  torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
  )
+ self.register_buffer(
+ "table_dims",
+ torch.tensor(dims, device="cpu", dtype=torch.int64),
+ )
+
+ info_B_num_bits_, info_B_mask_ = torch.ops.fbgemm.get_infos_metadata(
+ self.D_offsets, # unused tensor
+ 1, # max_B
+ T, # T
+ )
+ self.info_B_num_bits: int = info_B_num_bits_
+ self.info_B_mask: int = info_B_mask_

  assert cache_sets > 0
  element_size = weights_precision.bit_rate() // 8
  assert (
  element_size == 4 or element_size == 2
  ), f"Invalid element size {element_size}"
- cache_size = cache_sets * ASSOC * element_size * self.max_D
+ cache_size = cache_sets * ASSOC * element_size * self.cache_row_dim
  logging.info(
  f"Using cache for SSD with admission algorithm "
  f"{CacheAlgorithm.LRU}, {cache_sets} sets, stored on {'DEVICE' if ssd_cache_location is EmbeddingLocation.DEVICE else 'MANAGED'} with {ssd_rocksdb_shards} shards, "
@@ -243,10 +398,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  f"Memtable Flush Period: {ssd_memtable_flush_period}, "
  f"Memtable Flush Offset: {ssd_memtable_flush_offset}, "
  f"Desired L0 files per compaction: {ssd_l0_files_per_compact}, "
- f"{cache_size / 1024.0 / 1024.0 / 1024.0 : .2f}GB, "
+ f"Cache size: {cache_size / 1024.0 / 1024.0 / 1024.0 : .2f}GB, "
  f"weights precision: {weights_precision}, "
  f"output dtype: {output_dtype}, "
- f"chunk size in bulk init: {bulk_init_chunk_size} bytes"
+ f"chunk size in bulk init: {bulk_init_chunk_size} bytes, backend_type: {backend_type}, "
+ f"kv_zch_params: {kv_zch_params}, "
+ f"embedding spec: {embedding_specs}"
  )
  self.register_buffer(
  "lxu_cache_state",
@@ -262,6 +419,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  )

  self.step = 0
+ self.last_flush_step = -1

  # Set prefetch pipeline
  self.prefetch_pipeline: bool = prefetch_pipeline
@@ -291,10 +449,6 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  EmbeddingLocation.DEVICE,
  )

- assert weights_precision in (SparseType.FP32, SparseType.FP16)
- self.weights_precision = weights_precision
- self.output_dtype: int = output_dtype.as_int()
-
  cache_dtype = weights_precision.as_dtype()
  if ssd_cache_location == EmbeddingLocation.MANAGED:
  self.register_buffer(
@@ -305,7 +459,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  device=self.current_device,
  dtype=cache_dtype,
  ),
- [cache_sets * ASSOC, self.max_D],
+ [cache_sets * ASSOC, self.cache_row_dim],
  is_host_mapped=self.uvm_host_mapped,
  ),
  )
@@ -314,7 +468,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  "lxu_cache_weights",
  torch.zeros(
  cache_sets * ASSOC,
- self.max_D,
+ self.cache_row_dim,
  device=self.current_device,
  dtype=cache_dtype,
  ),
@@ -387,6 +541,15 @@ class SSDTableBatchedEmbeddingBags(nn.Module):

  self.timestep = 0

+ # Store the iteration number on GPU and CPU (used for certain optimizers)
+ persistent_iter_ = optimizer in (OptimType.PARTIAL_ROWWISE_ADAM,)
+ self.register_buffer(
+ "iter",
+ torch.zeros(1, dtype=torch.int64, device=self.current_device),
+ persistent=persistent_iter_,
+ )
+ self.iter_cpu: torch.Tensor = torch.zeros(1, dtype=torch.int64, device="cpu")
+
  # Dummy profile configuration for measuring the SSD get/set time
  # get and set are executed by another thread which (for some reason) is
  # not traceable by PyTorch's Kineto. We workaround this problem by
@@ -405,18 +568,46 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  f"FBGEMM_SSD_TBE_USE_DUMMY_PROFILE is set to {set_dummy_profile}; "
  f"Use dummy profile: {use_dummy_profile}"
  )
- # pyre-ignore[4]
+
  self.record_function_via_dummy_profile: Callable[..., Any] = (
  self.record_function_via_dummy_profile_factory(use_dummy_profile)
  )

- os.makedirs(ssd_storage_directory, exist_ok=True)
+ if use_passed_in_path:
+ ssd_dir_list = ssd_storage_directory.split(",")
+ for ssd_dir in ssd_dir_list:
+ os.makedirs(ssd_dir, exist_ok=True)

- ssd_directory = tempfile.mkdtemp(
- prefix="ssd_table_batched_embeddings", dir=ssd_storage_directory
- )
+ ssd_directory = ssd_storage_directory
  # logging.info("DEBUG: weights_precision {}".format(weights_precision))

+ """
+ ##################### for ZCH v.Next loading checkpoints Short Term Solution #######################
+ weight_id tensor is the weight and optimizer keys, to load from checkpoint, weight_id tensor
+ needs to be loaded first, then we can load the weight and optimizer tensors.
+ However, the stateful checkpoint loading does not guarantee the tensor loading order, so we need
+ to cache the weight_id, weight and optimizer tensors until all data are loaded, then we can apply
+ them to backend.
+ Currently, we'll cache the weight_id, weight and optimizer tensors in the KVZCHCachedData class,
+ and apply them to backend when all data are loaded. The downside of this solution is that we'll
+ have to duplicate a whole tensor memory to backend before we can release the python tensor memory,
+ which is not ideal.
+ The longer term solution is to support the caching from the backend side, and allow streaming-based
+ data movement from cached weight and optimizer to key/value format without duplicating one whole
+ tensor's memory.
+ """
+ self._cached_kvzch_data: Optional[KVZCHCachedData] = None
+ # initial embedding rows on this rank per table, this is used for loading checkpoint
+ self.local_weight_counts: list[int] = [0] * T_
+ # groundtruth global id on this rank per table, this is used for loading checkpoint
+ self.global_id_per_rank: list[torch.Tensor] = [torch.zeros(0)] * T_
+ # loading checkpoint flag, set by checkpoint loader, and cleared after weight is applied to backend
+ self.load_state_dict: bool = False
+
+ SSDTableBatchedEmbeddingBags._all_tbe_instances.add(self)
+ if SSDTableBatchedEmbeddingBags._first_instance_ref is None:
+ SSDTableBatchedEmbeddingBags._first_instance_ref = weakref.ref(self)
+
  # create tbe unique id using rank index | local tbe idx
  if tbe_unique_id == -1:
  SSDTableBatchedEmbeddingBags._local_instance_index += 1
@@ -432,21 +623,26 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  logging.warning("dist is not initialized, treating as single gpu cases")
  tbe_unique_id = SSDTableBatchedEmbeddingBags._local_instance_index
  self.tbe_unique_id = tbe_unique_id
+ self.l2_cache_size = l2_cache_size
  logging.info(f"tbe_unique_id: {tbe_unique_id}")
- if not ps_hosts:
+ self.enable_free_mem_trigger_eviction: bool = False
+ if self.backend_type == BackendType.SSD:
  logging.info(
- f"Logging SSD offloading setup, tbe_unique_id:{tbe_unique_id}, l2_cache_size:{l2_cache_size}GB, enable_async_update:{enable_async_update}"
- f"passed_in_path={ssd_directory}, num_shards={ssd_rocksdb_shards},num_threads={ssd_rocksdb_shards},"
- f"memtable_flush_period={ssd_memtable_flush_period},memtable_flush_offset={ssd_memtable_flush_offset},"
- f"l0_files_per_compact={ssd_l0_files_per_compact},max_D={self.max_D},rate_limit_mbps={ssd_rate_limit_mbps},"
- f"size_ratio={ssd_size_ratio},compaction_trigger={ssd_compaction_trigger}, lazy_bulk_init_enabled={lazy_bulk_init_enabled},"
- f"write_buffer_size_per_tbe={ssd_rocksdb_write_buffer_size},max_write_buffer_num_per_db_shard={ssd_max_write_buffer_num},"
- f"uniform_init_lower={ssd_uniform_init_lower},uniform_init_upper={ssd_uniform_init_upper},"
- f"row_storage_bitwidth={weights_precision.bit_rate()},block_cache_size_per_tbe={ssd_block_cache_size_per_tbe},"
- f"use_passed_in_path:{use_passed_in_path}, real_path will be printed in EmbeddingRocksDB"
+ f"Logging SSD offloading setup, tbe_unique_id:{tbe_unique_id}, l2_cache_size:{l2_cache_size}GB, "
+ f"enable_async_update:{enable_async_update}, passed_in_path={ssd_directory}, "
+ f"num_shards={ssd_rocksdb_shards}, num_threads={ssd_rocksdb_shards}, "
+ f"memtable_flush_period={ssd_memtable_flush_period}, memtable_flush_offset={ssd_memtable_flush_offset}, "
+ f"l0_files_per_compact={ssd_l0_files_per_compact}, max_D={self.max_D}, "
+ f"cache_row_size={self.cache_row_dim}, rate_limit_mbps={ssd_rate_limit_mbps}, "
+ f"size_ratio={ssd_size_ratio}, compaction_trigger={ssd_compaction_trigger}, "
+ f"lazy_bulk_init_enabled={lazy_bulk_init_enabled}, write_buffer_size_per_tbe={ssd_rocksdb_write_buffer_size}, "
+ f"max_write_buffer_num_per_db_shard={ssd_max_write_buffer_num}, "
+ f"uniform_init_lower={ssd_uniform_init_lower}, uniform_init_upper={ssd_uniform_init_upper}, "
+ f"row_storage_bitwidth={weights_precision.bit_rate()}, block_cache_size_per_tbe={ssd_block_cache_size_per_tbe}, "
+ f"use_passed_in_path:{use_passed_in_path}, real_path will be printed in EmbeddingRocksDB, "
+ f"enable_raw_embedding_streaming:{self.enable_raw_embedding_streaming}, flushing_block_size:{flushing_block_size}"
  )
  # pyre-fixme[4]: Attribute must be annotated.
- # pyre-ignore[16]
  self._ssd_db = torch.classes.fbgemm.EmbeddingRocksDBWrapper(
  ssd_directory,
  ssd_rocksdb_shards,
@@ -454,7 +650,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  ssd_memtable_flush_period,
  ssd_memtable_flush_offset,
  ssd_l0_files_per_compact,
- self.max_D,
+ self.cache_row_dim,
  ssd_rate_limit_mbps,
  ssd_size_ratio,
  ssd_compaction_trigger,
@@ -468,6 +664,24 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  tbe_unique_id,
  l2_cache_size,
  enable_async_update,
+ self.enable_raw_embedding_streaming,
+ self.res_params.res_store_shards,
+ self.res_params.res_server_port,
+ self.res_params.table_names,
+ self.res_params.table_offsets,
+ self.res_params.table_sizes,
+ (
+ tensor_pad4(self.table_dims)
+ if self.enable_optimizer_offloading
+ else None
+ ),
+ (
+ self.table_hash_size_cumsum.cpu()
+ if self.enable_optimizer_offloading
+ else None
+ ),
+ flushing_block_size,
+ self._embedding_cache_mode, # disable_random_init
  )
  if self.bulk_init_chunk_size > 0:
  self.ssd_uniform_init_lower: float = ssd_uniform_init_lower
@@ -476,11 +690,9 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self._lazy_initialize_ssd_tbe()
  else:
  self._insert_all_kv()
- else:
- # pyre-fixme[4]: Attribute must be annotated.
- # pyre-ignore[16]
+ elif self.backend_type == BackendType.PS:
  self._ssd_db = torch.classes.fbgemm.EmbeddingParameterServerWrapper(
- [host[0] for host in ps_hosts],
+ [host[0] for host in ps_hosts], # pyre-ignore
  [host[1] for host in ps_hosts],
  tbe_unique_id,
  (
@@ -491,14 +703,98 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  ps_client_thread_num if ps_client_thread_num is not None else 32,
  ps_max_key_per_request if ps_max_key_per_request is not None else 500,
  l2_cache_size,
- self.max_D,
+ self.cache_row_dim,
+ )
+ elif self.backend_type == BackendType.DRAM:
+ logging.info(
+ f"Logging DRAM offloading setup, tbe_unique_id:{tbe_unique_id}, l2_cache_size:{l2_cache_size}GB,"
+ f"num_shards={ssd_rocksdb_shards},num_threads={ssd_rocksdb_shards},"
+ f"max_D={self.max_D},"
+ f"uniform_init_lower={ssd_uniform_init_lower},uniform_init_upper={ssd_uniform_init_upper},"
+ f"row_storage_bitwidth={weights_precision.bit_rate()},"
+ f"self.cache_row_dim={self.cache_row_dim},"
+ f"enable_optimizer_offloading={self.enable_optimizer_offloading},"
+ f"feature_dims={self.feature_dims},"
+ f"hash_size_cumsum={self.hash_size_cumsum},"
+ f"backend_return_whole_row={self.backend_return_whole_row}"
  )
+ table_dims = (
+ tensor_pad4(self.table_dims)
+ if self.enable_optimizer_offloading
+ else None
+ ) # table_dims
+ eviction_config = None
+ if self.kv_zch_params and self.kv_zch_params.eviction_policy:
+ eviction_mem_threshold_gb = (
+ self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+ if self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+ else self.l2_cache_size
+ )
+ kv_zch_params = self.kv_zch_params
+ eviction_policy = self.kv_zch_params.eviction_policy
+ if eviction_policy.eviction_trigger_mode == 5:
+ # If trigger mode is free_mem(5), populate config
+ self.set_free_mem_eviction_trigger_config(eviction_policy)
+
+ enable_eviction_for_feature_score_eviction_policy = ( # pytorch api in c++ doesn't support vector<bool>, convert to int here, 0: no eviction 1: eviction
+ [
+ int(x)
+ for x in eviction_policy.enable_eviction_for_feature_score_eviction_policy
+ ]
+ if eviction_policy.enable_eviction_for_feature_score_eviction_policy
+ is not None
+ else None
+ )
+ # Please refer to https://fburl.com/gdoc/nuupjwqq for the following eviction parameters.
+ eviction_config = torch.classes.fbgemm.FeatureEvictConfig(
+ eviction_policy.eviction_trigger_mode, # eviction_trigger_mode, 0: disabled, 1: iteration, 2: mem_util, 3: manual, 4: id count
+ eviction_policy.eviction_strategy, # evict_trigger_strategy: 0: timestamp, 1: counter, 2: counter + timestamp, 3: feature l2 norm, 4: timestamp threshold 5: feature score
+ eviction_policy.eviction_step_intervals, # trigger_step_interval if trigger mode is iteration
+ eviction_mem_threshold_gb, # mem_util_threshold_in_GB if trigger mode is mem_util
+ eviction_policy.ttls_in_mins, # ttls_in_mins for each table if eviction strategy is timestamp
+ eviction_policy.counter_thresholds, # counter_thresholds for each table if eviction strategy is counter
+ eviction_policy.counter_decay_rates, # counter_decay_rates for each table if eviction strategy is counter
+ eviction_policy.feature_score_counter_decay_rates, # feature_score_counter_decay_rates for each table if eviction strategy is feature score
+ eviction_policy.training_id_eviction_trigger_count, # training_id_eviction_trigger_count for each table
+ eviction_policy.training_id_keep_count, # training_id_keep_count for each table
+ enable_eviction_for_feature_score_eviction_policy, # no eviction setting for feature score eviction policy
+ eviction_policy.l2_weight_thresholds, # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+ table_dims.tolist() if table_dims is not None else None,
+ eviction_policy.threshold_calculation_bucket_stride, # threshold_calculation_bucket_stride if eviction strategy is feature score
+ eviction_policy.threshold_calculation_bucket_num, # threshold_calculation_bucket_num if eviction strategy is feature score
+ eviction_policy.interval_for_insufficient_eviction_s,
+ eviction_policy.interval_for_sufficient_eviction_s,
+ eviction_policy.interval_for_feature_statistics_decay_s,
+ )
+ self._ssd_db = torch.classes.fbgemm.DramKVEmbeddingCacheWrapper(
+ self.cache_row_dim,
+ ssd_uniform_init_lower,
+ ssd_uniform_init_upper,
+ eviction_config,
+ ssd_rocksdb_shards, # num_shards
+ ssd_rocksdb_shards, # num_threads
+ weights_precision.bit_rate(), # row_storage_bitwidth
+ table_dims,
+ (
+ self.table_hash_size_cumsum.cpu()
+ if self.enable_optimizer_offloading
+ else None
+ ), # hash_size_cumsum
+ self.backend_return_whole_row, # backend_return_whole_row
+ False, # enable_async_update
+ self._embedding_cache_mode, # disable_random_init
+ )
+ else:
+ raise AssertionError(f"Invalid backend type {self.backend_type}")
+
  # pyre-fixme[20]: Argument `self` expected.
- (low_priority, high_priority) = torch.cuda.Stream.priority_range()
+ low_priority, high_priority = torch.cuda.Stream.priority_range()
  # GPU stream for SSD cache eviction
  self.ssd_eviction_stream = torch.cuda.Stream(priority=low_priority)
- # GPU stream for SSD memory copy
+ # GPU stream for SSD memory copy (also reused for feature score D2H)
  self.ssd_memcpy_stream = torch.cuda.Stream(priority=low_priority)
+ # GPU stream for async metadata operation
+ self.feature_score_stream = torch.cuda.Stream(priority=low_priority)

  # SSD get completion event
  self.ssd_event_get = torch.cuda.Event()
@@ -510,26 +806,93 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self.ssd_event_backward = torch.cuda.Event()
  # SSD get's input copy completion event
  self.ssd_event_get_inputs_cpy = torch.cuda.Event()
+ if self._embedding_cache_mode:
+ # Direct write embedding completion event
+ self.direct_write_l1_complete_event: torch.cuda.streams.Event = (
+ torch.cuda.Event()
+ )
+ self.direct_write_sp_complete_event: torch.cuda.streams.Event = (
+ torch.cuda.Event()
+ )
+ # Prefetch operation completion event
+ self.prefetch_complete_event = torch.cuda.Event()
+
  if self.prefetch_pipeline:
  # SSD scratch pad index queue insert completion event
  self.ssd_event_sp_idxq_insert: torch.cuda.streams.Event = torch.cuda.Event()
  # SSD scratch pad index queue lookup completion event
  self.ssd_event_sp_idxq_lookup: torch.cuda.streams.Event = torch.cuda.Event()

- self.timesteps_prefetched: List[int] = []
+ if self.enable_raw_embedding_streaming:
+ # RES reuse the eviction stream
+ self.ssd_event_cache_streamed: torch.cuda.streams.Event = torch.cuda.Event()
+ self.ssd_event_cache_streaming_synced: torch.cuda.streams.Event = (
+ torch.cuda.Event()
+ )
+ self.ssd_event_cache_streaming_computed: torch.cuda.streams.Event = (
+ torch.cuda.Event()
+ )
+ self.ssd_event_sp_streamed: torch.cuda.streams.Event = torch.cuda.Event()
+
+ # Updated buffers
+ self.register_buffer(
+ "lxu_cache_updated_weights",
+ torch.ops.fbgemm.new_unified_tensor(
+ torch.zeros(
+ 1,
+ device=self.current_device,
+ dtype=cache_dtype,
+ ),
+ self.lxu_cache_weights.shape,
+ is_host_mapped=self.uvm_host_mapped,
+ ),
+ )
+
+ # For storing embedding indices to update to
+ self.register_buffer(
+ "lxu_cache_updated_indices",
+ torch.ops.fbgemm.new_unified_tensor(
+ torch.zeros(
+ 1,
+ device=self.current_device,
+ dtype=torch.long,
+ ),
+ (self.lxu_cache_weights.shape[0],),
+ is_host_mapped=self.uvm_host_mapped,
+ ),
+ )
+
+ # For storing the number of updated rows
+ self.register_buffer(
+ "lxu_cache_updated_count",
+ torch.ops.fbgemm.new_unified_tensor(
+ torch.zeros(
+ 1,
+ device=self.current_device,
+ dtype=torch.int,
+ ),
+ (1,),
+ is_host_mapped=self.uvm_host_mapped,
+ ),
+ )
+
+ # (Indices, Count)
+ self.prefetched_info: list[tuple[Tensor, Tensor]] = []
+
+ self.timesteps_prefetched: list[int] = []
  # TODO: add type annotation
  # pyre-fixme[4]: Attribute must be annotated.
  self.ssd_prefetch_data = []

  # Scratch pad eviction data queue
- self.ssd_scratch_pad_eviction_data: List[
- Tuple[Tensor, Tensor, Tensor, bool]
+ self.ssd_scratch_pad_eviction_data: list[
+ tuple[Tensor, Tensor, Tensor, bool]
  ] = []
- self.ssd_location_update_data: List[Tuple[Tensor, Tensor]] = []
+ self.ssd_location_update_data: list[tuple[Tensor, Tensor]] = []

  if self.prefetch_pipeline:
  # Scratch pad value queue
- self.ssd_scratch_pads: List[Tuple[Tensor, Tensor, Tensor]] = []
+ self.ssd_scratch_pads: list[tuple[Tensor, Tensor, Tensor]] = []

  # pyre-ignore[4]
  # Scratch pad index queue
@@ -549,12 +912,15 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  )
  cowclip_regularization = CowClipDefinition()

+ self.learning_rate_tensor: torch.Tensor = torch.tensor(
+ learning_rate, device=torch.device("cpu"), dtype=torch.float32
+ )
+
  self.optimizer_args = invokers.lookup_args_ssd.OptimizerArgs(
  stochastic_rounding=stochastic_rounding,
  gradient_clipping=gradient_clipping,
  max_gradient=max_gradient,
  max_norm=max_norm,
- learning_rate=learning_rate,
  eps=eps,
  beta1=beta1,
  beta2=beta2,
@@ -575,7 +941,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  weight_norm_coefficient=cowclip_regularization.weight_norm_coefficient,
  lower_bound=cowclip_regularization.lower_bound,
  regularization_mode=weight_decay_mode.value,
- use_rowwise_bias_correction=False, # Unused, this is used in TBE's Adam
+ use_rowwise_bias_correction=use_rowwise_bias_correction, # Used in Adam optimizer
  )

  table_embedding_dtype = weights_precision.as_dtype()
@@ -593,19 +959,14 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  dtype=table_embedding_dtype,
  )

- momentum1_offsets = [0] + list(itertools.accumulate(rows))
- self._apply_split(
- SplitState(
- dev_size=self.total_hash_size,
- host_size=0,
- uvm_size=0,
- placements=[EmbeddingLocation.DEVICE for _ in range(T_)],
- offsets=momentum1_offsets[:-1],
- ),
- "momentum1",
+ # Create the optimizer state tensors
+ for template in self.optimizer.ssd_state_splits(
+ self.embedding_specs,
+ self.optimizer_state_dtypes,
+ self.enable_optimizer_offloading,
+ ):
  # pyre-fixme[6]: For 3rd argument expected `Type[dtype]` but got `dtype`.
- dtype=torch.float32,
- )
+ self._apply_split(*template)

  # For storing current iteration data
  self.current_iter_data: Optional[IterData] = None
@@ -625,11 +986,6 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self._update_cache_counter_and_pointers
  )

- assert optimizer in (
- OptimType.EXACT_ROWWISE_ADAGRAD,
- ), f"Optimizer {optimizer} is not supported by SSDTableBatchedEmbeddingBags"
- self.optimizer = optimizer
-
  # stats reporter
  self.gather_ssd_cache_stats = gather_ssd_cache_stats
  self.stats_reporter: Optional[TBEStatsReporter] = (
@@ -638,7 +994,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self.ssd_cache_stats_size = 6
  # 0: N_calls, 1: N_requested_indices, 2: N_unique_indices, 3: N_unique_misses,
  # 4: N_conflict_unique_misses, 5: N_conflict_misses
- self.last_reported_ssd_stats: List[float] = []
+ self.last_reported_ssd_stats: list[float] = []
  self.last_reported_step = 0

  self.register_buffer(
@@ -669,7 +1025,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self.prefetch_parallel_stream_cnt: int = 2
  # tuple of iteration, prefetch parallel stream cnt, reported duration
  # since there are 2 stream in parallel in prefetch, we want to count the longest one
- self.prefetch_duration_us: Tuple[int, int, float] = (
+ self.prefetch_duration_us: tuple[int, int, float] = (
  -1,
  self.prefetch_parallel_stream_cnt,
  0,
@@ -689,6 +1045,26 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self.l2_cache_capacity_stats_name: str = (
  f"l2_cache.mem.tbe_id{tbe_unique_id}.capacity_bytes"
  )
+ self.dram_kv_actual_used_chunk_bytes_stats_name: str = (
+ f"dram_kv.mem.tbe_id{tbe_unique_id}.actual_used_chunk_bytes"
+ )
+ self.dram_kv_allocated_bytes_stats_name: str = (
+ f"dram_kv.mem.tbe_id{tbe_unique_id}.allocated_bytes"
+ )
+ self.dram_kv_mem_num_rows_stats_name: str = (
+ f"dram_kv.mem.tbe_id{tbe_unique_id}.num_rows"
+ )
+
+ self.eviction_sum_evicted_counts_stats_name: str = (
+ f"eviction.tbe_id.{tbe_unique_id}.sum_evicted_counts"
+ )
+ self.eviction_sum_processed_counts_stats_name: str = (
+ f"eviction.tbe_id.{tbe_unique_id}.sum_processed_counts"
+ )
+ self.eviction_evict_rate_stats_name: str = (
+ f"eviction.tbe_id.{tbe_unique_id}.evict_rate"
+ )
+
  if self.stats_reporter:
  self.ssd_prefetch_read_timer = AsyncSeriesTimer(
  functools.partial(
@@ -708,11 +1084,77 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  )
  # pyre-ignore
  self.stats_reporter.register_stats(self.l2_num_cache_misses_stats_name)
- # pyre-ignore
  self.stats_reporter.register_stats(self.l2_num_cache_lookups_stats_name)
  self.stats_reporter.register_stats(self.l2_num_cache_evictions_stats_name)
  self.stats_reporter.register_stats(self.l2_cache_free_mem_stats_name)
  self.stats_reporter.register_stats(self.l2_cache_capacity_stats_name)
+ self.stats_reporter.register_stats(self.dram_kv_allocated_bytes_stats_name)
+ self.stats_reporter.register_stats(
+ self.dram_kv_actual_used_chunk_bytes_stats_name
+ )
+ self.stats_reporter.register_stats(self.dram_kv_mem_num_rows_stats_name)
+ self.stats_reporter.register_stats(
+ self.eviction_sum_evicted_counts_stats_name
+ )
+ self.stats_reporter.register_stats(
+ self.eviction_sum_processed_counts_stats_name
+ )
+ self.stats_reporter.register_stats(self.eviction_evict_rate_stats_name)
+ for t in self.feature_table_map:
+ self.stats_reporter.register_stats(
+ f"eviction.feature_table.{t}.evicted_counts"
+ )
+ self.stats_reporter.register_stats(
+ f"eviction.feature_table.{t}.processed_counts"
+ )
+ self.stats_reporter.register_stats(
+ f"eviction.feature_table.{t}.evict_rate"
+ )
+ self.stats_reporter.register_stats(
+ "eviction.feature_table.full_duration_ms"
+ )
+ self.stats_reporter.register_stats(
+ "eviction.feature_table.exec_duration_ms"
+ )
+ self.stats_reporter.register_stats(
+ "eviction.feature_table.dry_run_exec_duration_ms"
+ )
+ self.stats_reporter.register_stats(
+ "eviction.feature_table.exec_div_full_duration_rate"
+ )
+
+ self.bounds_check_version: int = get_bounds_check_version_for_platform()
+
+ self._pg = pg
+
+ @cached_property
+ def cache_row_dim(self) -> int:
+ """
+ Compute the effective physical cache row size taking into account
+ padding to the nearest 4 elements and the optimizer state appended to
+ the back of the row
+ """
+
+ # For st publish, we only need to load weight for publishing and bulk eval
+ if self.enable_optimizer_offloading and not self.load_ckpt_without_opt:
+ return self.max_D + pad4(
+ # Compute the number of elements of cache_dtype needed to store
+ # the optimizer state
+ self.optimizer_state_dim
+ )
+ else:
+ return self.max_D
+
+ @cached_property
+ def optimizer_state_dim(self) -> int:
+ return int(
+ math.ceil(
+ self.optimizer.state_size_nbytes(
+ self.max_D, self.optimizer_state_dtypes
+ )
+ / self.weights_precision.as_dtype().itemsize
+ )
+ )

  @property
  # pyre-ignore
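The cache_row_dim and optimizer_state_dim properties introduced above determine the physical row width used by the RocksDB/DRAM backends when optimizer state is offloaded alongside the weights. A small worked example, under the assumptions that pad4 rounds an element count up to the next multiple of 4 (as the name and the "padding to the nearest 4 elements" docstring suggest) and that rowwise Adagrad keeps a single FP32 momentum1 value per row:

import math

# Hypothetical numbers, not taken from the package.
itemsize = 2                    # FP16 weight element size in bytes
optimizer_state_nbytes = 4      # one FP32 momentum1 scalar per row
optimizer_state_dim = math.ceil(optimizer_state_nbytes / itemsize)  # -> 2 elements

def pad4(n: int) -> int:
    # assumed behavior: round up to the next multiple of 4
    return (n + 3) // 4 * 4

max_D = 128
cache_row_dim = max_D + pad4(optimizer_state_dim)  # 128 + 4 = 132 elements per cached row
print(cache_row_dim)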
@@ -766,19 +1208,22 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  be effectively overwritten. This function should only be called once at
  initialization time.
  """
+ self._ssd_db.toggle_compaction(False)
  row_offset = 0
  row_count = floor(
  self.bulk_init_chunk_size
- / (self.max_D * self.weights_precision.as_dtype().itemsize)
+ / (self.cache_row_dim * self.weights_precision.as_dtype().itemsize)
  )
  total_dim0 = 0
  for dim0, _ in self.embedding_specs:
  total_dim0 += dim0

  start_ts = time.time()
+ # TODO: do we have case for non-kvzch ssd with bulk init enabled + optimizer offloading? probably not?
+ # if we have such cases, we should only init the emb dim not the optimizer dim
  chunk_tensor = torch.empty(
  row_count,
- self.max_D,
+ self.cache_row_dim,
  dtype=self.weights_precision.as_dtype(),
  device="cuda",
  )
@@ -793,12 +1238,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  # This code is intentionally not calling through the getter property
  # to avoid the lazy initialization thread from joining with itself.
  self._ssd_db.set_range_to_storage(rand_val, row_offset, actual_dim0)
- self.ssd_db.toggle_compaction(True)
  end_ts = time.time()
  elapsed = int((end_ts - start_ts) * 1e6)
  logging.info(
  f"TBE bulk initialization took {elapsed:_} us, bulk_init_chunk_size={self.bulk_init_chunk_size}, each batch of {row_count} rows, total rows of {total_dim0}"
  )
+ self._ssd_db.toggle_compaction(True)

  @torch.jit.ignore
  def _report_duration(
@@ -826,7 +1271,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  """
  recorded_itr, stream_cnt, report_val = self.prefetch_duration_us
  duration = dur_ms
- if time_unit == "us": # pyre-ignore
+ if time_unit == "us":
  duration = dur_ms * 1000
  if it_step == recorded_itr:
  report_val = max(report_val, duration)
@@ -845,7 +1290,6 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  it_step, event_name, report_val, time_unit=time_unit
  )

- # pyre-ignore[3]
  def record_function_via_dummy_profile_factory(
  self,
  use_dummy_profile: bool,
@@ -867,7 +1311,6 @@ class SSDTableBatchedEmbeddingBags(nn.Module):

  def func(
  name: str,
- # pyre-ignore[2]
  fn: Callable[..., Any],
  *args: Any,
  **kwargs: Any,
@@ -881,7 +1324,6 @@ class SSDTableBatchedEmbeddingBags(nn.Module):

  def func(
  name: str,
- # pyre-ignore[2]
  fn: Callable[..., Any],
  *args: Any,
  **kwargs: Any,
@@ -894,10 +1336,10 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  self,
  split: SplitState,
  prefix: str,
- dtype: Type[torch.dtype],
+ dtype: type[torch.dtype],
  enforce_hbm: bool = False,
  make_dev_param: bool = False,
- dev_reshape: Optional[Tuple[int, ...]] = None,
+ dev_reshape: Optional[tuple[int, ...]] = None,
  ) -> None:
  apply_split_helper(
  self.register_buffer,
@@ -920,11 +1362,11 @@ class SSDTableBatchedEmbeddingBags(nn.Module):

  def to_pinned_cpu_on_stream_wait_on_another_stream(
  self,
- tensors: List[Tensor],
+ tensors: list[Tensor],
  stream: torch.cuda.Stream,
  stream_to_wait_on: torch.cuda.Stream,
  post_event: Optional[torch.cuda.Event] = None,
- ) -> List[Tensor]:
+ ) -> list[Tensor]:
  """
  Transfer input tensors from GPU to CPU using a pinned host
  buffer. The transfer is carried out on the given stream
@@ -982,9 +1424,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  is_rows_uvm (bool): A flag to indicate whether `rows` is a UVM
  tensor (which is accessible on both host and
  device)
+ is_bwd (bool): A flag to indicate if the eviction is during backward
  Returns:
  None
  """
+ if not self.training: # if not training, freeze the embedding
+ return
  with record_function(f"## ssd_evict_{name} ##"):
  with torch.cuda.stream(stream):
  if pre_event is not None:
@@ -1007,6 +1452,95 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  if post_event is not None:
  stream.record_event(post_event)

+ def raw_embedding_stream_sync(
+ self,
+ stream: torch.cuda.Stream,
+ pre_event: Optional[torch.cuda.Event],
+ post_event: Optional[torch.cuda.Event],
+ name: Optional[str] = "",
+ ) -> None:
+ """
+ Blocking wait for the copy operation of the tensors to be streamed,
+ to make sure they are not overwritten
+ Args:
+ stream (Stream): The CUDA stream that cudaStreamAddCallback will
+ synchronize the host function with. Moreover, the
+ asynchronous D->H memory copies will operate on
+ this stream
+ pre_event (Event): The CUDA event that the stream has to wait on
+ post_event (Event): The CUDA event that the current stream will record on
+ when the eviction is done
+ Returns:
+ None
+ """
+ with record_function(f"## ssd_stream_{name} ##"):
+ with torch.cuda.stream(stream):
+ if pre_event is not None:
+ stream.wait_event(pre_event)
+
+ self.record_function_via_dummy_profile(
+ f"## ssd_stream_sync_{name} ##",
+ self.ssd_db.stream_sync_cuda,
+ )
+
+ if post_event is not None:
+ stream.record_event(post_event)
+
+ def raw_embedding_stream(
+ self,
+ rows: Tensor,
+ indices_cpu: Tensor,
+ actions_count_cpu: Tensor,
+ stream: torch.cuda.Stream,
+ pre_event: Optional[torch.cuda.Event],
+ post_event: Optional[torch.cuda.Event],
+ is_rows_uvm: bool,
+ blocking_tensor_copy: bool = True,
+ name: Optional[str] = "",
+ ) -> None:
+ """
+ Stream data from the given input tensors to a remote service
+ Args:
+ rows (Tensor): The 2D tensor that contains rows to evict
+ indices_cpu (Tensor): The 1D CPU tensor that contains the row
+ indices that the rows will be evicted to
+ actions_count_cpu (Tensor): A scalar tensor that contains the
+ number of rows that the evict function
+ has to process
+ stream (Stream): The CUDA stream that cudaStreamAddCallback will
+ synchronize the host function with. Moreover, the
+ asynchronous D->H memory copies will operate on
+ this stream
+ pre_event (Event): The CUDA event that the stream has to wait on
+ post_event (Event): The CUDA event that the current stream will record on
+ when the eviction is done
+ is_rows_uvm (bool): A flag to indicate whether `rows` is a UVM
+ tensor (which is accessible on both host and
+ device)
+ Returns:
+ None
+ """
+ with record_function(f"## ssd_stream_{name} ##"):
+ with torch.cuda.stream(stream):
+ if pre_event is not None:
+ stream.wait_event(pre_event)
+
+ rows_cpu = rows if is_rows_uvm else self.to_pinned_cpu(rows)
+
+ rows.record_stream(stream)
+
+ self.record_function_via_dummy_profile(
+ f"## ssd_stream_{name} ##",
+ self.ssd_db.stream_cuda,
+ indices_cpu,
+ rows_cpu,
+ actions_count_cpu,
+ blocking_tensor_copy,
+ )
+
+ if post_event is not None:
+ stream.record_event(post_event)
+
  def _evict_from_scratch_pad(self, grad: Tensor) -> None:
  """
  Evict conflict missed rows from a scratch pad
@@ -1043,6 +1577,18 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  if not do_evict:
  return

+ if self.enable_raw_embedding_streaming:
+ self.raw_embedding_stream(
+ rows=inserted_rows,
+ indices_cpu=post_bwd_evicted_indices_cpu,
+ actions_count_cpu=actions_count_cpu,
+ stream=self.ssd_eviction_stream,
+ pre_event=self.ssd_event_backward,
+ post_event=self.ssd_event_sp_streamed,
+ is_rows_uvm=True,
+ blocking_tensor_copy=True,
+ name="scratch_pad",
+ )
  self.evict(
  rows=inserted_rows,
  indices_cpu=post_bwd_evicted_indices_cpu,
@@ -1060,7 +1606,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  def _update_cache_counter_and_pointers(
  self,
  module: nn.Module,
- grad_input: Union[Tuple[Tensor, ...], Tensor],
+ grad_input: Union[tuple[Tensor, ...], Tensor],
  ) -> None:
  """
  Update cache line locking counter and pointers before backward
@@ -1145,9 +1691,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
  if len(self.ssd_location_update_data) == 0:
  return

- (sp_curr_next_map, inserted_rows_next) = self.ssd_location_update_data.pop(
- 0
- )
+ sp_curr_next_map, inserted_rows_next = self.ssd_location_update_data.pop(0)

  # Update pointers
  torch.ops.fbgemm.ssd_update_row_addrs(
@@ -1162,12 +1706,63 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1162
1706
  unique_indices_length_curr=curr_data.actions_count_gpu,
1163
1707
  )
1164
1708
 
1709
+ def _update_feature_score_metadata(
1710
+ self,
1711
+ linear_cache_indices: Tensor,
1712
+ weights: Tensor,
1713
+ d2h_stream: torch.cuda.Stream,
1714
+ write_stream: torch.cuda.Stream,
1715
+ pre_event_for_write: torch.cuda.Event,
1716
+ post_event: Optional[torch.cuda.Event] = None,
1717
+ ) -> None:
1718
+ """
1719
+ Write feature score metadata to DRAM
1720
+
1721
+ This method performs D2H copy on d2h_stream, then writes to DRAM on write_stream.
1722
+ The caller is responsible for ensuring d2h_stream doesn't compete with other D2H operations.
1723
+
1724
+ Args:
1725
+ linear_cache_indices: GPU tensor containing cache indices
1726
+ weights: GPU tensor containing feature scores
1727
+ d2h_stream: Stream for D2H copy operation (should already be synchronized appropriately)
1728
+ write_stream: Stream for metadata write operation
1729
+ pre_event_for_write: Event to wait on before writing metadata (e.g., wait for eviction)
1730
+ post_event: Event to record when the operation is done
1731
+ """
1732
+ # Start D2H copy on d2h_stream
1733
+ with torch.cuda.stream(d2h_stream):
1734
+ # Record streams to prevent premature deallocation
1735
+ linear_cache_indices.record_stream(d2h_stream)
1736
+ weights.record_stream(d2h_stream)
1737
+ # Do the D2H copy
1738
+ linear_cache_indices_cpu = self.to_pinned_cpu(linear_cache_indices)
1739
+ score_weights_cpu = self.to_pinned_cpu(weights)
1740
+
1741
+ # Write feature score metadata to DRAM
1742
+ with record_function("## ssd_write_feature_score_metadata ##"):
1743
+ with torch.cuda.stream(write_stream):
1744
+ write_stream.wait_event(pre_event_for_write)
1745
+ write_stream.wait_stream(d2h_stream)
1746
+ self.record_function_via_dummy_profile(
1747
+ "## ssd_write_feature_score_metadata ##",
1748
+ self.ssd_db.set_feature_score_metadata_cuda,
1749
+ linear_cache_indices_cpu,
1750
+ torch.tensor(
1751
+ [score_weights_cpu.shape[0]], device="cpu", dtype=torch.long
1752
+ ),
1753
+ score_weights_cpu,
1754
+ )
1755
+
1756
+ if post_event is not None:
1757
+ write_stream.record_event(post_event)
1758
+
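Note the two different ordering primitives used above: wait_event orders the write stream after one specific recorded event (here, the cache eviction), while wait_stream orders it after everything already queued on the D2H stream. A small sketch of the distinction, assuming a CUDA device is available:

    import torch

    if torch.cuda.is_available():
        d2h_stream = torch.cuda.Stream()
        write_stream = torch.cuda.Stream()
        evict_done = torch.cuda.Event()

        src = torch.randn(1024, device="cuda")
        dst = torch.empty(1024, pin_memory=True)

        # Pretend the eviction finished on the default stream
        torch.cuda.current_stream().record_event(evict_done)

        # D2H copy on its own stream
        with torch.cuda.stream(d2h_stream):
            src.record_stream(d2h_stream)         # keep src alive for this stream
            dst.copy_(src, non_blocking=True)

        # The writer must follow both the eviction and the D2H copy
        with torch.cuda.stream(write_stream):
            write_stream.wait_event(evict_done)   # wait on one specific event
            write_stream.wait_stream(d2h_stream)  # wait on all work queued so far
            # ... the metadata write would be issued here ...

        torch.cuda.synchronize()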
1165
1759
  def prefetch(
1166
1760
  self,
1167
1761
  indices: Tensor,
1168
1762
  offsets: Tensor,
1763
+ weights: Optional[Tensor] = None, # todo: need to update caller
1169
1764
  forward_stream: Optional[torch.cuda.Stream] = None,
1170
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
1765
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
1171
1766
  ) -> None:
1172
1767
  if self.prefetch_stream is None and forward_stream is not None:
1173
1768
  # Set the prefetch stream to the current stream
@@ -1191,6 +1786,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1191
1786
  self._prefetch(
1192
1787
  indices,
1193
1788
  offsets,
1789
+ weights,
1194
1790
  vbe_metadata,
1195
1791
  forward_stream,
1196
1792
  )
@@ -1199,11 +1795,17 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1199
1795
  self,
1200
1796
  indices: Tensor,
1201
1797
  offsets: Tensor,
1798
+ weights: Optional[Tensor] = None,
1202
1799
  vbe_metadata: Optional[invokers.lookup_args.VBEMetadata] = None,
1203
1800
  forward_stream: Optional[torch.cuda.Stream] = None,
1204
1801
  ) -> None:
1205
- # TODO: Refactor prefetch
1802
+ # Wait for any ongoing direct_write_embedding operations to complete
1803
+ # Moving this from forward() to _prefetch() is more logical as direct_write
1804
+ # operations affect the same cache structures that prefetch interacts with
1206
1805
  current_stream = torch.cuda.current_stream()
1806
+ if self._embedding_cache_mode:
1807
+ current_stream.wait_event(self.direct_write_l1_complete_event)
1808
+ current_stream.wait_event(self.direct_write_sp_complete_event)
1207
1809
 
1208
1810
  B_offsets = None
1209
1811
  max_B = -1
@@ -1284,10 +1886,83 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1284
1886
  masks=torch.where(evicted_indices != -1, 1, 0),
1285
1887
  count=actions_count_gpu,
1286
1888
  )
1287
-
1288
- with record_function("## ssd_d2h_inserted_indices ##"):
1289
- # Transfer actions_count and insert_indices right away to
1290
- # incrase an overlap opportunity
1889
+ has_raw_embedding_streaming = False
1890
+ if self.enable_raw_embedding_streaming:
1891
+ # When pipelining is enabled,
1892
+ # prefetch in iter i happens before the backward sparse in iter i - 1,
1893
+ # so embeddings for iter i - 1's changed ids are not yet updated
1894
+ # and we can only fetch the indices from iter i - 2.
1895
+ # When pipelining is disabled,
1896
+ # prefetch in iter i happens before forward in iter i,
1897
+ # so we can safely get iter i - 1's changed ids.
1898
+ target_prev_iter = 1
1899
+ if self.prefetch_pipeline:
1900
+ target_prev_iter = 2
1901
+ if len(self.prefetched_info) > (target_prev_iter - 1):
1902
+ with record_function(
1903
+ "## ssd_lookup_prefetched_rows {} {} ##".format(
1904
+ self.timestep, self.tbe_unique_id
1905
+ )
1906
+ ):
1907
+ # wait for the copy to finish before overwriting the buffer
1908
+ self.raw_embedding_stream_sync(
1909
+ stream=self.ssd_eviction_stream,
1910
+ pre_event=self.ssd_event_cache_streamed,
1911
+ post_event=self.ssd_event_cache_streaming_synced,
1912
+ name="cache_update",
1913
+ )
1914
+ current_stream.wait_event(self.ssd_event_cache_streaming_synced)
1915
+ updated_indices, updated_counts_gpu = self.prefetched_info.pop(
1916
+ 0
1917
+ )
1918
+ self.lxu_cache_updated_indices[: updated_indices.size(0)].copy_(
1919
+ updated_indices,
1920
+ non_blocking=True,
1921
+ )
1922
+ self.lxu_cache_updated_count[:1].copy_(
1923
+ updated_counts_gpu, non_blocking=True
1924
+ )
1925
+ has_raw_embedding_streaming = True
1926
+
1927
+ with record_function(
1928
+ "## ssd_save_prefetched_rows {} {} ##".format(
1929
+ self.timestep, self.tbe_unique_id
1930
+ )
1931
+ ):
1932
+ masked_updated_indices = torch.where(
1933
+ torch.where(lxu_cache_locations != -1, True, False),
1934
+ linear_cache_indices,
1935
+ -1,
1936
+ )
1937
+
1938
+ (
1939
+ uni_updated_indices,
1940
+ uni_updated_indices_length,
1941
+ ) = get_unique_indices_v2(
1942
+ masked_updated_indices,
1943
+ self.total_hash_size,
1944
+ compute_count=False,
1945
+ compute_inverse_indices=False,
1946
+ )
1947
+ assert uni_updated_indices is not None
1948
+ assert uni_updated_indices_length is not None
1949
+ # The unique indices tensor has one more -1 element than needed,
1950
+ # which might make the tensor length go out of range
1951
+ # compared to the pre-allocated buffer.
1952
+ unique_len = min(
1953
+ self.lxu_cache_weights.size(0),
1954
+ uni_updated_indices.size(0),
1955
+ )
1956
+ self.prefetched_info.append(
1957
+ (
1958
+ uni_updated_indices.narrow(0, 0, unique_len),
1959
+ uni_updated_indices_length.clamp(max=unique_len),
1960
+ )
1961
+ )
1962
+
1963
+ with record_function("## ssd_d2h_inserted_indices ##"):
1964
+ # Transfer actions_count and insert_indices right away to
1965
+ # increase an overlap opportunity
1291
1966
  actions_count_cpu, inserted_indices_cpu = (
1292
1967
  self.to_pinned_cpu_on_stream_wait_on_another_stream(
1293
1968
  tensors=[
@@ -1312,7 +1987,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1312
1987
 
1313
1988
  # Allocation a scratch pad for the current iteration. The scratch
1314
1989
  # pad is a UVA tensor
1315
- inserted_rows_shape = (assigned_cache_slots.numel(), self.max_D)
1990
+ inserted_rows_shape = (assigned_cache_slots.numel(), self.cache_row_dim)
1316
1991
  if linear_cache_indices.numel() > 0:
1317
1992
  inserted_rows = torch.ops.fbgemm.new_unified_tensor(
1318
1993
  torch.zeros(
@@ -1415,25 +2090,66 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1415
2090
  # Store info for evicting the previous iteration's
1416
2091
  # scratch pad after the corresponding backward pass is
1417
2092
  # done
1418
- self.ssd_location_update_data.append(
1419
- (
1420
- sp_curr_prev_map_gpu,
1421
- inserted_rows,
2093
+ if self.training:
2094
+ self.ssd_location_update_data.append(
2095
+ (
2096
+ sp_curr_prev_map_gpu,
2097
+ inserted_rows,
2098
+ )
1422
2099
  )
1423
- )
1424
2100
 
1425
2101
  # Ensure the previous iterations eviction is complete
1426
2102
  current_stream.wait_event(self.ssd_event_sp_evict)
1427
2103
  # Ensure that D2H is done
1428
2104
  current_stream.wait_event(self.ssd_event_get_inputs_cpy)
1429
2105
 
2106
+ if self.enable_raw_embedding_streaming and has_raw_embedding_streaming:
2107
+ current_stream.wait_event(self.ssd_event_sp_streamed)
2108
+ with record_function(
2109
+ "## ssd_compute_updated_rows {} {} ##".format(
2110
+ self.timestep, self.tbe_unique_id
2111
+ )
2112
+ ):
2113
+ # cache rows that are changed in the previous iteration
2114
+ updated_cache_locations = torch.ops.fbgemm.lxu_cache_lookup(
2115
+ self.lxu_cache_updated_indices,
2116
+ self.lxu_cache_state,
2117
+ self.total_hash_size,
2118
+ self.gather_ssd_cache_stats,
2119
+ self.local_ssd_cache_stats,
2120
+ )
2121
+ torch.ops.fbgemm.masked_index_select(
2122
+ self.lxu_cache_updated_weights,
2123
+ updated_cache_locations,
2124
+ self.lxu_cache_weights,
2125
+ self.lxu_cache_updated_count,
2126
+ )
2127
+ current_stream.record_event(self.ssd_event_cache_streaming_computed)
2128
+
2129
+ self.raw_embedding_stream(
2130
+ rows=self.lxu_cache_updated_weights,
2131
+ indices_cpu=self.lxu_cache_updated_indices,
2132
+ actions_count_cpu=self.lxu_cache_updated_count,
2133
+ stream=self.ssd_eviction_stream,
2134
+ pre_event=self.ssd_event_cache_streaming_computed,
2135
+ post_event=self.ssd_event_cache_streamed,
2136
+ is_rows_uvm=True,
2137
+ blocking_tensor_copy=False,
2138
+ name="cache_update",
2139
+ )
2140
+
1430
2141
  if self.gather_ssd_cache_stats:
1431
2142
  # call to collect past SSD IO dur right before next rocksdb IO
1432
2143
 
1433
2144
  self.ssd_cache_stats = torch.add(
1434
2145
  self.ssd_cache_stats, self.local_ssd_cache_stats
1435
2146
  )
1436
- self._report_ssd_stats()
2147
+ # only report metrics from rank0 to avoid flooded logging
2148
+ if dist.get_rank() == 0:
2149
+ self._report_kv_backend_stats()
2150
+
2151
+ # May trigger eviction if free mem trigger mode enabled before get cuda
2152
+ self.may_trigger_eviction()
1437
2153
 
1438
2154
  # Fetch data from SSD
1439
2155
  if linear_cache_indices.numel() > 0:
@@ -1457,21 +2173,35 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1457
2173
  use_pipeline=self.prefetch_pipeline,
1458
2174
  )
1459
2175
 
1460
- if linear_cache_indices.numel() > 0:
1461
- # Evict rows from cache to SSD
1462
- self.evict(
1463
- rows=self.lxu_cache_evicted_weights,
1464
- indices_cpu=self.lxu_cache_evicted_indices,
1465
- actions_count_cpu=self.lxu_cache_evicted_count,
1466
- stream=self.ssd_eviction_stream,
1467
- pre_event=self.ssd_event_get,
1468
- # Record completion event after scratch pad eviction
1469
- # instead since that happens after L1 eviction
1470
- post_event=self.ssd_event_cache_evict,
1471
- is_rows_uvm=True,
1472
- name="cache",
1473
- is_bwd=False,
1474
- )
2176
+ if self.training:
2177
+ if linear_cache_indices.numel() > 0:
2178
+ # Evict rows from cache to SSD
2179
+ self.evict(
2180
+ rows=self.lxu_cache_evicted_weights,
2181
+ indices_cpu=self.lxu_cache_evicted_indices,
2182
+ actions_count_cpu=self.lxu_cache_evicted_count,
2183
+ stream=self.ssd_eviction_stream,
2184
+ pre_event=self.ssd_event_get,
2185
+ # Record completion event after scratch pad eviction
2186
+ # instead since that happens after L1 eviction
2187
+ post_event=self.ssd_event_cache_evict,
2188
+ is_rows_uvm=True,
2189
+ name="cache",
2190
+ is_bwd=False,
2191
+ )
2192
+ if (
2193
+ self.backend_type == BackendType.DRAM
2194
+ and weights is not None
2195
+ and linear_cache_indices.numel() > 0
2196
+ ):
2197
+ # Reuse ssd_memcpy_stream for feature score D2H since critical D2H is done
2198
+ self._update_feature_score_metadata(
2199
+ linear_cache_indices=linear_cache_indices,
2200
+ weights=weights,
2201
+ d2h_stream=self.ssd_memcpy_stream,
2202
+ write_stream=self.feature_score_stream,
2203
+ pre_event_for_write=self.ssd_event_cache_evict,
2204
+ )
1475
2205
 
1476
2206
  # Generate row addresses (pointing to either L1 or the current
1477
2207
  # iteration's scratch pad)
@@ -1553,24 +2283,32 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1553
2283
  )
1554
2284
  )
1555
2285
 
1556
- # Store scratch pad info for post backward eviction
1557
- self.ssd_scratch_pad_eviction_data.append(
1558
- (
1559
- inserted_rows,
1560
- post_bwd_evicted_indices_cpu,
1561
- actions_count_cpu,
1562
- linear_cache_indices.numel() > 0,
2286
+ # Store scratch pad info for post backward eviction only for training
2287
+ # for eval job, no backward pass, so no need to store this info
2288
+ if self.training:
2289
+ self.ssd_scratch_pad_eviction_data.append(
2290
+ (
2291
+ inserted_rows,
2292
+ post_bwd_evicted_indices_cpu,
2293
+ actions_count_cpu,
2294
+ linear_cache_indices.numel() > 0,
2295
+ )
1563
2296
  )
1564
- )
1565
2297
 
1566
2298
  # Store data for forward
1567
2299
  self.ssd_prefetch_data.append(prefetch_data)
1568
2300
 
2301
+ # Record an event to mark the completion of prefetch operations
2302
+ # This will be used by direct_write_embedding to ensure it doesn't run concurrently with prefetch
2303
+ current_stream.record_event(self.prefetch_complete_event)
2304
+
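The pipelining comment near the top of this method ("we can only fetch the indices from iter i - 2" when the prefetch pipeline is on) reduces to a small queue whose safe depth depends on the pipeline mode. A stripped-down sketch of that bookkeeping, with the FBGEMM payloads replaced by placeholder strings:

    from collections import deque

    class PrefetchedInfoQueue:
        """Holds per-iteration prefetch info until it is safe to consume it."""

        def __init__(self, prefetch_pipeline: bool) -> None:
            # With pipelining, prefetch(i) runs before backward(i - 1), so the
            # newest safe entry is from iteration i - 2; otherwise i - 1 is safe.
            self.target_prev_iter = 2 if prefetch_pipeline else 1
            self.queue: deque = deque()

        def push(self, info) -> None:
            self.queue.append(info)

        def pop_if_ready(self):
            # Mirrors `len(self.prefetched_info) > (target_prev_iter - 1)`
            if len(self.queue) > self.target_prev_iter - 1:
                return self.queue.popleft()
            return None

    q = PrefetchedInfoQueue(prefetch_pipeline=True)
    q.push("iter-0 ids")
    assert q.pop_if_ready() is None       # queue not deep enough yet
    q.push("iter-1 ids")
    assert q.pop_if_ready() == "iter-0 ids"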
1569
2305
  @torch.jit.ignore
1570
2306
  def _generate_vbe_metadata(
1571
2307
  self,
1572
2308
  offsets: Tensor,
1573
- batch_size_per_feature_per_rank: Optional[List[List[int]]],
2309
+ batch_size_per_feature_per_rank: Optional[list[list[int]]],
2310
+ vbe_output: Optional[Tensor] = None,
2311
+ vbe_output_offsets: Optional[Tensor] = None,
1574
2312
  ) -> invokers.lookup_args.VBEMetadata:
1575
2313
  # Blocking D2H copy, but only runs at first call
1576
2314
  self.feature_dims = self.feature_dims.cpu()
@@ -1589,19 +2327,58 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1589
2327
  self.pooling_mode,
1590
2328
  self.feature_dims,
1591
2329
  self.current_device,
2330
+ vbe_output,
2331
+ vbe_output_offsets,
1592
2332
  )
1593
2333
 
2334
+ def _increment_iteration(self) -> int:
2335
+ # Although self.iter_cpu is created on CPU. It might be transferred to
2336
+ # GPU by the user. So, we need to transfer it to CPU explicitly. This
2337
+ # should be done only once.
2338
+ self.iter_cpu = self.iter_cpu.cpu()
2339
+
2340
+ # Sync with loaded state
2341
+ # Wrap to make it compatible with PT2 compile
2342
+ if not is_torchdynamo_compiling():
2343
+ if self.iter_cpu.item() == 0:
2344
+ self.iter_cpu.fill_(self.iter.cpu().item())
2345
+
2346
+ # Increment the iteration counter
2347
+ # The CPU counterpart is used for local computation
2348
+ iter_int = int(self.iter_cpu.add_(1).item())
2349
+ # The GPU counterpart is used for checkpointing
2350
+ self.iter.add_(1)
2351
+
2352
+ return iter_int
2353
+
1594
2354
  def forward(
1595
2355
  self,
1596
2356
  indices: Tensor,
1597
2357
  offsets: Tensor,
2358
+ weights: Optional[Tensor] = None,
1598
2359
  per_sample_weights: Optional[Tensor] = None,
1599
2360
  feature_requires_grad: Optional[Tensor] = None,
1600
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
2361
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
2362
+ vbe_output: Optional[Tensor] = None,
2363
+ vbe_output_offsets: Optional[Tensor] = None,
1601
2364
  # pyre-fixme[7]: Expected `Tensor` but got implicit return value of `None`.
1602
2365
  ) -> Tensor:
2366
+ self.clear_cache()
2367
+ if vbe_output is not None or vbe_output_offsets is not None:
2368
+ # CPU is not supported in SSD TBE
2369
+ check_allocated_vbe_output(
2370
+ self.output_dtype,
2371
+ batch_size_per_feature_per_rank,
2372
+ vbe_output,
2373
+ vbe_output_offsets,
2374
+ )
1603
2375
  indices, offsets, per_sample_weights, vbe_metadata = self.prepare_inputs(
1604
- indices, offsets, per_sample_weights, batch_size_per_feature_per_rank
2376
+ indices,
2377
+ offsets,
2378
+ per_sample_weights,
2379
+ batch_size_per_feature_per_rank,
2380
+ vbe_output=vbe_output,
2381
+ vbe_output_offsets=vbe_output_offsets,
1605
2382
  )
1606
2383
 
1607
2384
  if len(self.timesteps_prefetched) == 0:
@@ -1615,7 +2392,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1615
2392
  context=self.step,
1616
2393
  stream=self.ssd_eviction_stream,
1617
2394
  ):
1618
- self._prefetch(indices, offsets, vbe_metadata)
2395
+ self._prefetch(indices, offsets, weights, vbe_metadata)
1619
2396
 
1620
2397
  assert len(self.ssd_prefetch_data) > 0
1621
2398
 
@@ -1674,18 +2451,33 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1674
2451
  "post_bwd_evicted_indices": post_bwd_evicted_indices_cpu,
1675
2452
  "actions_count": actions_count_cpu,
1676
2453
  },
2454
+ enable_optimizer_offloading=self.enable_optimizer_offloading,
1677
2455
  # pyre-fixme[6]: Expected `lookup_args_ssd.VBEMetadata` but got `lookup_args.VBEMetadata`
1678
2456
  vbe_metadata=vbe_metadata,
2457
+ learning_rate_tensor=self.learning_rate_tensor,
2458
+ info_B_num_bits=self.info_B_num_bits,
2459
+ info_B_mask=self.info_B_mask,
1679
2460
  )
1680
2461
 
1681
2462
  self.timesteps_prefetched.pop(0)
1682
2463
  self.step += 1
1683
2464
 
1684
- if self.optimizer == OptimType.EXACT_SGD:
1685
- raise AssertionError(
1686
- "SSDTableBatchedEmbeddingBags currently does not support SGD"
2465
+ # Increment the iteration (value is used for certain optimizers)
2466
+ iter_int = self._increment_iteration()
2467
+
2468
+ if self.optimizer in [OptimType.PARTIAL_ROWWISE_ADAM, OptimType.ADAM]:
2469
+ momentum2 = invokers.lookup_args_ssd.Momentum(
2470
+ # pyre-ignore[6]
2471
+ dev=self.momentum2_dev,
2472
+ # pyre-ignore[6]
2473
+ host=self.momentum2_host,
2474
+ # pyre-ignore[6]
2475
+ uvm=self.momentum2_uvm,
2476
+ # pyre-ignore[6]
2477
+ offsets=self.momentum2_offsets,
2478
+ # pyre-ignore[6]
2479
+ placements=self.momentum2_placements,
1687
2480
  )
1688
- return invokers.lookup_sgd_ssd.invoke(common_args, self.optimizer_args)
1689
2481
 
1690
2482
  momentum1 = invokers.lookup_args_ssd.Momentum(
1691
2483
  dev=self.momentum1_dev,
@@ -1696,44 +2488,584 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1696
2488
  )
1697
2489
 
1698
2490
  if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
1699
- # pyre-fixme[7]: Expected `Tensor` but got implicit return value of `None`.
1700
2491
  return invokers.lookup_rowwise_adagrad_ssd.invoke(
1701
2492
  common_args, self.optimizer_args, momentum1
1702
2493
  )
1703
2494
 
2495
+ elif self.optimizer == OptimType.PARTIAL_ROWWISE_ADAM:
2496
+ return invokers.lookup_partial_rowwise_adam_ssd.invoke(
2497
+ common_args,
2498
+ self.optimizer_args,
2499
+ momentum1,
2500
+ # pyre-ignore[61]
2501
+ momentum2,
2502
+ iter_int,
2503
+ )
2504
+
2505
+ elif self.optimizer == OptimType.ADAM:
2506
+ row_counter = invokers.lookup_args_ssd.Momentum(
2507
+ # pyre-fixme[6]
2508
+ dev=self.row_counter_dev,
2509
+ # pyre-fixme[6]
2510
+ host=self.row_counter_host,
2511
+ # pyre-fixme[6]
2512
+ uvm=self.row_counter_uvm,
2513
+ # pyre-fixme[6]
2514
+ offsets=self.row_counter_offsets,
2515
+ # pyre-fixme[6]
2516
+ placements=self.row_counter_placements,
2517
+ )
2518
+
2519
+ return invokers.lookup_adam_ssd.invoke(
2520
+ common_args,
2521
+ self.optimizer_args,
2522
+ momentum1,
2523
+ # pyre-ignore[61]
2524
+ momentum2,
2525
+ iter_int,
2526
+ row_counter=row_counter,
2527
+ )
2528
+
1704
2529
  @torch.jit.ignore
1705
- def debug_split_optimizer_states(self) -> List[Tuple[torch.Tensor]]:
2530
+ def _split_optimizer_states_non_kv_zch(
2531
+ self,
2532
+ ) -> list[list[torch.Tensor]]:
1706
2533
  """
1707
- Returns a list of states, split by table
1708
- Testing only
2534
+ Returns a list of optimizer state views, split by table.
2535
+
2536
+ Returns:
2537
+ A list of lists of states. Shape = (the number of tables, the number
2538
+ of states).
2539
+
2540
+ The following shows the list of states (in the returned order) for
2541
+ each optimizer:
2542
+
2543
+ (1) `EXACT_ROWWISE_ADAGRAD`: `momentum1` (rowwise)
2544
+
2545
+ (2) `PARTIAL_ROWWISE_ADAM`: `momentum1`, `momentum2` (rowwise)
+
+ (3) `ADAM`: `momentum1`, `momentum2`
1709
2546
  """
1710
- (rows, _) = zip(*self.embedding_specs)
1711
2547
 
1712
- rows_cumsum = [0] + list(itertools.accumulate(rows))
2548
+ # Row count per table
2549
+ rows, dims = zip(*self.embedding_specs)
2550
+ # Cumulative row counts per table for rowwise states
2551
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
2552
+ # Cumulative element counts per table for elementwise states
2553
+ elem_count_cumsum: list[int] = [0] + list(
2554
+ itertools.accumulate([r * d for r, d in self.embedding_specs])
2555
+ )
2556
+
2557
+ # pyre-ignore[53]
2558
+ def _slice(tensor: Tensor, t: int, rowwise: bool) -> Tensor:
2559
+ d: int = dims[t]
2560
+ e: int = rows[t]
2561
+
2562
+ if not rowwise:
2563
+ # Optimizer state is element-wise - compute the table offset for
2564
+ # the table, view the slice as 2D tensor
2565
+ return tensor.detach()[
2566
+ elem_count_cumsum[t] : elem_count_cumsum[t + 1]
2567
+ ].view(-1, d)
2568
+ else:
2569
+ # Optimizer state is row-wise - fetch elements in range and view
2570
+ # slice as 1D
2571
+ return tensor.detach()[
2572
+ row_count_cumsum[t] : row_count_cumsum[t + 1]
2573
+ ].view(e)
2574
+
2575
+ if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
2576
+ return [
2577
+ [_slice(self.momentum1_dev, t, rowwise=True)]
2578
+ for t, _ in enumerate(rows)
2579
+ ]
2580
+ elif self.optimizer == OptimType.PARTIAL_ROWWISE_ADAM:
2581
+ return [
2582
+ [
2583
+ _slice(self.momentum1_dev, t, rowwise=False),
2584
+ # pyre-ignore[6]
2585
+ _slice(self.momentum2_dev, t, rowwise=True),
2586
+ ]
2587
+ for t, _ in enumerate(rows)
2588
+ ]
2589
+
2590
+ elif self.optimizer == OptimType.ADAM:
2591
+ return [
2592
+ [
2593
+ _slice(self.momentum1_dev, t, rowwise=False),
2594
+ # pyre-ignore[6]
2595
+ _slice(self.momentum2_dev, t, rowwise=False),
2596
+ ]
2597
+ for t, _ in enumerate(rows)
2598
+ ]
2599
+
2600
+ else:
2601
+ raise NotImplementedError(
2602
+ f"Getting optimizer states is not supported for {self.optimizer}"
2603
+ )
2604
+
2605
+ @torch.jit.ignore
2606
+ def _split_optimizer_states_kv_zch_no_offloading(
2607
+ self,
2608
+ sorted_ids: torch.Tensor,
2609
+ ) -> list[list[torch.Tensor]]:
2610
+
2611
+ # Row count per table
2612
+ rows, dims = zip(*self.embedding_specs)
2613
+ # Cumulative row counts per table for rowwise states
2614
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
2615
+ # Cumulative element counts per table for elementwise states
2616
+ elem_count_cumsum: list[int] = [0] + list(
2617
+ itertools.accumulate([r * d for r, d in self.embedding_specs])
2618
+ )
2619
+
2620
+ # pyre-ignore[53]
2621
+ def _slice(state_name: str, tensor: Tensor, t: int, rowwise: bool) -> Tensor:
2622
+ d: int = dims[t]
2623
+
2624
+ # pyre-ignore[16]
2625
+ bucket_id_start, _ = self.kv_zch_params.bucket_offsets[t]
2626
+ # pyre-ignore[16]
2627
+ bucket_size = self.kv_zch_params.bucket_sizes[t]
2628
+
2629
+ if sorted_ids is None or sorted_ids[t].numel() == 0:
2630
+ # Empty optimizer state for module initialization
2631
+ return torch.empty(
2632
+ 0,
2633
+ dtype=(
2634
+ self.optimizer_state_dtypes.get(
2635
+ state_name, SparseType.FP32
2636
+ ).as_dtype()
2637
+ ),
2638
+ device="cpu",
2639
+ )
2640
+
2641
+ elif not rowwise:
2642
+ # Optimizer state is element-wise - materialize the local ids
2643
+ # based on the sorted_ids compute the table offset for the
2644
+ # table, view the slice as 2D tensor of e x d, then fetch the
2645
+ # sub-slice by local ids
2646
+ #
2647
+ # local_ids is [N, 1], flatten it to N to keep the returned tensor 2D
2648
+ local_ids = (sorted_ids[t] - bucket_id_start * bucket_size).view(-1)
2649
+ return (
2650
+ tensor.detach()
2651
+ .cpu()[elem_count_cumsum[t] : elem_count_cumsum[t + 1]]
2652
+ .view(-1, d)[local_ids]
2653
+ )
2654
+
2655
+ else:
2656
+ # Optimizer state is row-wise - materialize the local ids based
2657
+ # on the sorted_ids and table offset (i.e. row count cumsum),
2658
+ # then fetch by local ids
2659
+ linearized_local_ids = (
2660
+ sorted_ids[t] - bucket_id_start * bucket_size + row_count_cumsum[t]
2661
+ )
2662
+ return tensor.detach().cpu()[linearized_local_ids].view(-1)
2663
+
2664
+ if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
2665
+ return [
2666
+ [_slice("momentum1", self.momentum1_dev, t, rowwise=True)]
2667
+ for t, _ in enumerate(rows)
2668
+ ]
2669
+
2670
+ elif self.optimizer == OptimType.PARTIAL_ROWWISE_ADAM:
2671
+ return [
2672
+ [
2673
+ _slice("momentum1", self.momentum1_dev, t, rowwise=False),
2674
+ # pyre-ignore[6]
2675
+ _slice("momentum2", self.momentum2_dev, t, rowwise=True),
2676
+ ]
2677
+ for t, _ in enumerate(rows)
2678
+ ]
2679
+
2680
+ elif self.optimizer == OptimType.ADAM:
2681
+ return [
2682
+ [
2683
+ _slice("momentum1", self.momentum1_dev, t, rowwise=False),
2684
+ # pyre-ignore[6]
2685
+ _slice("momentum2", self.momentum2_dev, t, rowwise=False),
2686
+ ]
2687
+ for t, _ in enumerate(rows)
2688
+ ]
2689
+
2690
+ else:
2691
+ raise NotImplementedError(
2692
+ f"Getting optimizer states is not supported for {self.optimizer}"
2693
+ )
2694
+
2695
+ @torch.jit.ignore
2696
+ def _split_optimizer_states_kv_zch_w_offloading(
2697
+ self,
2698
+ sorted_ids: torch.Tensor,
2699
+ no_snapshot: bool = True,
2700
+ should_flush: bool = False,
2701
+ ) -> list[list[torch.Tensor]]:
2702
+ dtype = self.weights_precision.as_dtype()
2703
+ # Row count per table
2704
+ rows_, dims_ = zip(*self.embedding_specs)
2705
+ # Cumulative row counts per table for rowwise states
2706
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows_))
2707
+
2708
+ snapshot_handle, _ = self._may_create_snapshot_for_state_dict(
2709
+ no_snapshot=no_snapshot,
2710
+ should_flush=should_flush,
2711
+ )
2712
+
2713
+ # pyre-ignore[53]
2714
+ def _fetch_offloaded_optimizer_states(
2715
+ t: int,
2716
+ ) -> list[Tensor]:
2717
+ e: int = rows_[t]
2718
+ d: int = dims_[t]
2719
+
2720
+ # pyre-ignore[16]
2721
+ bucket_id_start, _ = self.kv_zch_params.bucket_offsets[t]
2722
+ # pyre-ignore[16]
2723
+ bucket_size = self.kv_zch_params.bucket_sizes[t]
2724
+
2725
+ row_offset = row_count_cumsum[t] - (bucket_id_start * bucket_size)
2726
+ # Count of rows to fetch
2727
+ rows_to_fetch = sorted_ids[t].numel()
2728
+
2729
+ # Lookup the byte offsets for each optimizer state
2730
+ optimizer_state_byte_offsets = self.optimizer.byte_offsets_along_row(
2731
+ d, self.weights_precision, self.optimizer_state_dtypes
2732
+ )
2733
+ # Find the minimum start of all the start/end pairs - we have to
2734
+ # offset the start/end pairs by this value to get the correct start/end
2735
+ offset_ = min(
2736
+ [start for _, (start, _) in optimizer_state_byte_offsets.items()]
2737
+ )
2738
+ # Update the start/end pairs to be relative to offset_
2739
+ optimizer_state_byte_offsets = dict(
2740
+ (k, (v1 - offset_, v2 - offset_))
2741
+ for k, (v1, v2) in optimizer_state_byte_offsets.items()
2742
+ )
2743
+
2744
+ # Since the backend returns cache rows that pack the weights and
2745
+ # optimizer states together, reading the whole tensor could cause OOM,
2746
+ # so we use the KVTensorWrapper abstraction to query the backend and
2747
+ # fetch the data in chunks instead.
2748
+ tensor_wrapper = torch.classes.fbgemm.KVTensorWrapper(
2749
+ shape=[
2750
+ e,
2751
+ # Dim is in terms of the **weights** dtype
2752
+ self.optimizer_state_dim,
2753
+ ],
2754
+ dtype=dtype,
2755
+ row_offset=row_offset,
2756
+ snapshot_handle=snapshot_handle,
2757
+ sorted_indices=sorted_ids[t],
2758
+ width_offset=pad4(d),
2759
+ )
2760
+ (
2761
+ tensor_wrapper.set_embedding_rocks_dp_wrapper(self.ssd_db)
2762
+ if self.backend_type == BackendType.SSD
2763
+ else tensor_wrapper.set_dram_db_wrapper(self.ssd_db)
2764
+ )
2765
+
2766
+ # Fetch the state size table for the given weights dimension
2767
+ state_size_table = self.optimizer.state_size_table(d)
2768
+
2769
+ # Create a 2D output buffer of [rows x optimizer state dim] with the
2770
+ # weights type as the type. For optimizers with multiple states (e.g.
2771
+ # momentum1 and momentum2), this tensor will include data from all
2772
+ # states, hence self.optimizer_state_dim as the row size.
2773
+ optimizer_states_buffer = torch.empty(
2774
+ (rows_to_fetch, self.optimizer_state_dim), dtype=dtype, device="cpu"
2775
+ )
2776
+
2777
+ # Set the chunk size for fetching
2778
+ chunk_size = (
2779
+ # 10M rows => 260(max_D)* 2(ele_bytes) * 10M => 5.2GB mem spike
2780
+ 10_000_000
2781
+ )
2782
+ logging.info(f"split optimizer chunk rows: {chunk_size}")
2783
+
2784
+ # Chunk the fetching by chunk_size
2785
+ for i in range(0, rows_to_fetch, chunk_size):
2786
+ length = min(chunk_size, rows_to_fetch - i)
2787
+
2788
+ # Fetch from backend and copy to the output buffer
2789
+ optimizer_states_buffer.narrow(0, i, length).copy_(
2790
+ tensor_wrapper.narrow(0, i, length).view(dtype)
2791
+ )
2792
+
2793
+ # Now split up the buffer into N views, N for each optimizer state
2794
+ optimizer_states: list[Tensor] = []
2795
+ for state_name in self.optimizer.state_names():
2796
+ # Extract the offsets
2797
+ start, end = optimizer_state_byte_offsets[state_name]
2798
+
2799
+ state = optimizer_states_buffer.view(
2800
+ # Force tensor to byte view
2801
+ dtype=torch.uint8
2802
+ # Copy by byte offsets
2803
+ )[:, start:end].view(
2804
+ # Re-view in the state's target dtype
2805
+ self.optimizer_state_dtypes.get(
2806
+ state_name, SparseType.FP32
2807
+ ).as_dtype()
2808
+ )
2809
+
2810
+ optimizer_states.append(
2811
+ # If the state is rowwise (i.e. just one element per row),
2812
+ # then re-view as 1D tensor
2813
+ state
2814
+ if state_size_table[state_name] > 1
2815
+ else state.view(-1)
2816
+ )
2817
+
2818
+ # Return the views
2819
+ return optimizer_states
1713
2820
 
1714
2821
  return [
1715
2822
  (
1716
- self.momentum1_dev.detach()[rows_cumsum[t] : rows_cumsum[t + 1]].view(
1717
- row
1718
- ),
2823
+ self.optimizer.empty_states([0], [d], self.optimizer_state_dtypes)[0]
2824
+ # Return a set of empty states if sorted_ids[t] is empty
2825
+ if sorted_ids is None or sorted_ids[t].numel() == 0
2826
+ # Else fetch the list of optimizer states for the table
2827
+ else _fetch_offloaded_optimizer_states(t)
1719
2828
  )
1720
- for t, row in enumerate(rows)
2829
+ for t, d in enumerate(dims_)
1721
2830
  ]
1722
2831
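With optimizer offloading, each backend row packs the padded weights followed by the optimizer states, and the per-state views are recovered by slicing a uint8 view at byte offsets and re-viewing in the state's dtype, as the loop above does. A self-contained sketch of that byte-slicing trick with hypothetical offsets (fp16 fetch buffer, two fp32 rowwise states):

    import torch

    num_rows = 4
    opt_region_dim = 4                  # in weight (fp16) elements -> 8 bytes per row

    # Pretend this is the optimizer region fetched from the backend, in weight dtype
    buf = torch.zeros(num_rows, opt_region_dim, dtype=torch.float16)

    # Hypothetical byte offsets within the fetched region:
    # momentum1 occupies bytes [0, 4), momentum2 occupies bytes [4, 8)
    offsets = {"momentum1": (0, 4), "momentum2": (4, 8)}

    byte_view = buf.view(torch.uint8)                       # [num_rows, 8] bytes
    states = {
        name: byte_view[:, start:end].view(torch.float32)   # re-view in the state dtype
        for name, (start, end) in offsets.items()
    }

    states["momentum1"].fill_(1.0)
    states["momentum2"].fill_(2.0)
    assert states["momentum1"].shape == (num_rows, 1)
    assert states["momentum2"].shape == (num_rows, 1)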
 
2832
+ @torch.jit.ignore
2833
+ def _split_optimizer_states_kv_zch_whole_row(
2834
+ self,
2835
+ sorted_ids: torch.Tensor,
2836
+ no_snapshot: bool = True,
2837
+ should_flush: bool = False,
2838
+ ) -> list[list[torch.Tensor]]:
2839
+ dtype = self.weights_precision.as_dtype()
2840
+
2841
+ # Row and dimension counts per table
2842
+ # rows_ is only used here to compute the virtual table offsets
2843
+ rows_, dims_ = zip(*self.embedding_specs)
2844
+
2845
+ # Cumulative row counts per (virtual) table for rowwise states
2846
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows_))
2847
+
2848
+ snapshot_handle, _ = self._may_create_snapshot_for_state_dict(
2849
+ no_snapshot=no_snapshot,
2850
+ should_flush=should_flush,
2851
+ )
2852
+
2853
+ # pyre-ignore[53]
2854
+ def _fetch_offloaded_optimizer_states(
2855
+ t: int,
2856
+ ) -> list[Tensor]:
2857
+ d: int = dims_[t]
2858
+
2859
+ # pyre-ignore[16]
2860
+ bucket_id_start, _ = self.kv_zch_params.bucket_offsets[t]
2861
+ # pyre-ignore[16]
2862
+ bucket_size = self.kv_zch_params.bucket_sizes[t]
2863
+ row_offset = row_count_cumsum[t] - (bucket_id_start * bucket_size)
2864
+
2865
+ # When backend returns whole row, the optimizer will be returned as
2866
+ # PMT directly
2867
+ if sorted_ids[t].size(0) == 0 and self.local_weight_counts[t] > 0:
2868
+ logging.info(
2869
+ f"Before opt PMT loading, resetting id tensor with {self.local_weight_counts[t]}"
2870
+ )
2871
+ sorted_ids[t] = torch.zeros(
2872
+ (self.local_weight_counts[t], 1),
2873
+ device=torch.device("cpu"),
2874
+ dtype=torch.int64,
2875
+ )
2876
+
2877
+ # Lookup the byte offsets for each optimizer state relative to the
2878
+ # start of the weights
2879
+ optimizer_state_byte_offsets = self.optimizer.byte_offsets_along_row(
2880
+ d, self.weights_precision, self.optimizer_state_dtypes
2881
+ )
2882
+ # Get the number of elements (of the optimizer state dtype) per state
2883
+ optimizer_state_size_table = self.optimizer.state_size_table(d)
2884
+
2885
+ # Get metaheader dimensions in number of elements of weight dtype
2886
+ metaheader_dim = (
2887
+ # pyre-ignore[16]
2888
+ self.kv_zch_params.eviction_policy.meta_header_lens[t]
2889
+ )
2890
+
2891
+ # Now split up the buffer into N views, N for each optimizer state
2892
+ optimizer_states: list[PartiallyMaterializedTensor] = []
2893
+ for state_name in self.optimizer.state_names():
2894
+ state_dtype = self.optimizer_state_dtypes.get(
2895
+ state_name, SparseType.FP32
2896
+ ).as_dtype()
2897
+
2898
+ # Get the size of the state, converted from elements of the state's
2899
+ # dtype into elements of the **weights** dtype
2900
+ state_size = math.ceil(
2901
+ optimizer_state_size_table[state_name]
2902
+ * state_dtype.itemsize
2903
+ / dtype.itemsize
2904
+ )
2905
+
2906
+ # Extract the offsets relative to the start of the weights (in
2907
+ # num bytes)
2908
+ start, _ = optimizer_state_byte_offsets[state_name]
2909
+
2910
+ # Convert the start to number of elements in terms of the
2911
+ # **weights** dtype, then add the mmetaheader dim offset
2912
+ start = metaheader_dim + start // dtype.itemsize
2913
+
2914
+ shape = [
2915
+ (
2916
+ sorted_ids[t].size(0)
2917
+ if sorted_ids is not None and sorted_ids[t].size(0) > 0
2918
+ else self.local_weight_counts[t]
2919
+ ),
2920
+ (
2921
+ # Dim is in terms of the **weights** dtype
2922
+ state_size
2923
+ ),
2924
+ ]
2925
+
2926
+ # NOTE: We have to view using the **weights** dtype, as
2927
+ # there is currently a bug with KVTensorWrapper where using
2928
+ # a different dtype does not result in the same bytes being
2929
+ # returned, e.g.
2930
+ #
2931
+ # KVTensorWrapper(dtype=fp32, width_offset=130, shape=[N, 1])
2932
+ #
2933
+ # is NOT the same as
2934
+ #
2935
+ # KVTensorWrapper(dtype=fp16, width_offset=260, shape=[N, 2]).view(-1).view(fp32)
2936
+ #
2937
+ # TODO: Fix KVTensorWrapper to support viewing data under different dtypes
2938
+ tensor_wrapper = torch.classes.fbgemm.KVTensorWrapper(
2939
+ shape=shape,
2940
+ dtype=(
2941
+ # NOTE: Use the *weights* dtype
2942
+ dtype
2943
+ ),
2944
+ row_offset=row_offset,
2945
+ snapshot_handle=snapshot_handle,
2946
+ sorted_indices=sorted_ids[t],
2947
+ width_offset=(
2948
+ # NOTE: Width offset is in terms of **weights** dtype
2949
+ start
2950
+ ),
2951
+ # Optimizer written to DB with weights, so skip write here
2952
+ read_only=True,
2953
+ )
2954
+ (
2955
+ tensor_wrapper.set_embedding_rocks_dp_wrapper(self.ssd_db)
2956
+ if self.backend_type == BackendType.SSD
2957
+ else tensor_wrapper.set_dram_db_wrapper(self.ssd_db)
2958
+ )
2959
+
2960
+ optimizer_states.append(
2961
+ PartiallyMaterializedTensor(tensor_wrapper, True)
2962
+ )
2963
+
2964
+ # pyre-ignore [7]
2965
+ return optimizer_states
2966
+
2967
+ return [_fetch_offloaded_optimizer_states(t) for t, _ in enumerate(dims_)]
2968
+
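The state_size computation above converts a state's element count from its own dtype into units of the weights dtype; for example, one fp32 rowwise value stored alongside fp16 weights occupies ceil(1 * 4 / 2) = 2 weight-dtype slots. A tiny sketch of that conversion:

    import math
    import torch

    def state_size_in_weight_elems(num_state_elems: int,
                                   state_dtype: torch.dtype,
                                   weight_dtype: torch.dtype) -> int:
        # Number of weight-dtype elements needed to hold the state's bytes
        return math.ceil(num_state_elems * state_dtype.itemsize / weight_dtype.itemsize)

    # One rowwise fp32 value next to fp16 weights -> 2 fp16 slots
    assert state_size_in_weight_elems(1, torch.float32, torch.float16) == 2
    # A 128-wide fp16 state next to fp16 weights -> 128 slots
    assert state_size_in_weight_elems(128, torch.float16, torch.float16) == 128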
2969
+ @torch.jit.export
2970
+ def split_optimizer_states(
2971
+ self,
2972
+ sorted_id_tensor: Optional[list[torch.Tensor]] = None,
2973
+ no_snapshot: bool = True,
2974
+ should_flush: bool = False,
2975
+ ) -> list[list[torch.Tensor]]:
2976
+ """
2977
+ Returns a list of optimizer states split by table.
2978
+
2979
+ Since EXACT_ROWWISE_ADAGRAD has small optimizer states, we would generate
2980
+ a full tensor for each table (shard). When other optimizer types are supported,
2981
+ we should integrate with KVTensorWrapper (ssd_split_table_batched_embeddings.cpp)
2982
+ to allow caller to read the optimizer states using `narrow()` in a rolling-window manner.
2983
+
2984
+ Args:
2985
+ sorted_id_tensor (Optional[List[torch.Tensor]]): sorted id tensor by table, used to query optimizer
2986
+ state from backend. Callers should reuse the generated id tensor from the weight state_dict, to guarantee
2987
+ id consistency between weight and optimizer states.
2988
+
2989
+ """
2990
+
2991
+ # Handle the non-KVZCH case
2992
+ if not self.kv_zch_params:
2993
+ # If not in KV ZCH mode, use the non-KV ZCH path
2994
+ return self._split_optimizer_states_non_kv_zch()
2995
+
2996
+ # Handle the loading-from-state-dict case
2997
+ if self.load_state_dict:
2998
+ # Initialize for checkpoint loading
2999
+ assert (
3000
+ self._cached_kvzch_data is not None
3001
+ and self._cached_kvzch_data.cached_optimizer_states_per_table
3002
+ ), "optimizer state is not initialized for load checkpointing"
3003
+
3004
+ return self._cached_kvzch_data.cached_optimizer_states_per_table
3005
+
3006
+ logging.info(
3007
+ f"split_optimizer_states for KV ZCH: {no_snapshot=}, {should_flush=}"
3008
+ )
3009
+ start_time = time.time()
3010
+
3011
+ if not self.enable_optimizer_offloading:
3012
+ # Handle the KVZCH non-optimizer-offloading case
3013
+ optimizer_states = self._split_optimizer_states_kv_zch_no_offloading(
3014
+ sorted_id_tensor
3015
+ )
3016
+
3017
+ elif not self.backend_return_whole_row:
3018
+ # Handle the KVZCH with-optimizer-offloading case
3019
+ optimizer_states = self._split_optimizer_states_kv_zch_w_offloading(
3020
+ sorted_id_tensor, no_snapshot, should_flush
3021
+ )
3022
+
3023
+ else:
3024
+ # Handle the KVZCH with-optimizer-offloading backend-whole-row case
3025
+ optimizer_states = self._split_optimizer_states_kv_zch_whole_row(
3026
+ sorted_id_tensor, no_snapshot, should_flush
3027
+ )
3028
+
3029
+ logging.info(
3030
+ f"KV ZCH tables split_optimizer_states query latency: {(time.time() - start_time) * 1000} ms, "
3031
+ # pyre-ignore[16]
3032
+ f"num ids list: {None if not sorted_id_tensor else [ids.numel() for ids in sorted_id_tensor]}"
3033
+ )
3034
+
3035
+ return optimizer_states
3036
+
3037
+ @torch.jit.export
3038
+ def get_optimizer_state(
3039
+ self,
3040
+ sorted_id_tensor: Optional[list[torch.Tensor]],
3041
+ no_snapshot: bool = True,
3042
+ should_flush: bool = False,
3043
+ ) -> list[dict[str, torch.Tensor]]:
3044
+ """
3045
+ Returns a list of dictionaries of optimizer states split by table.
3046
+ """
3047
+ states_list: list[list[Tensor]] = self.split_optimizer_states(
3048
+ sorted_id_tensor=sorted_id_tensor,
3049
+ no_snapshot=no_snapshot,
3050
+ should_flush=should_flush,
3051
+ )
3052
+ state_names = self.optimizer.state_names()
3053
+ return [dict(zip(state_names, states)) for states in states_list]
3054
+
1723
3055
  @torch.jit.export
1724
- def debug_split_embedding_weights(self) -> List[torch.Tensor]:
3056
+ def debug_split_embedding_weights(self) -> list[torch.Tensor]:
1725
3057
  """
1726
3058
  Returns a list of weights, split by table.
1727
3059
 
1728
3060
  Testing only, very slow.
1729
3061
  """
1730
- (rows, _) = zip(*self.embedding_specs)
3062
+ rows, _ = zip(*self.embedding_specs)
1731
3063
 
1732
3064
  rows_cumsum = [0] + list(itertools.accumulate(rows))
1733
3065
  splits = []
1734
3066
  get_event = torch.cuda.Event()
1735
3067
 
1736
- for t, (row, dim) in enumerate(self.embedding_specs):
3068
+ for t, (row, _) in enumerate(self.embedding_specs):
1737
3069
  weights = torch.empty(
1738
3070
  (row, self.max_D), dtype=self.weights_precision.as_dtype()
1739
3071
  )
@@ -1765,12 +3097,92 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1765
3097
 
1766
3098
  return splits
1767
3099
 
3100
+ def clear_cache(self) -> None:
3101
+ # clear KV ZCH cache for checkpointing
3102
+ self._cached_kvzch_data = None
3103
+
3104
+ @torch.jit.ignore
3105
+ # pyre-ignore [3] - do not define snapshot class EmbeddingSnapshotHandleWrapper to avoid import dependency in other production code
3106
+ def _may_create_snapshot_for_state_dict(
3107
+ self,
3108
+ no_snapshot: bool = True,
3109
+ should_flush: bool = False,
3110
+ ):
3111
+ """
3112
+ Create a rocksdb snapshot if needed.
3113
+ """
3114
+ start_time = time.time()
3115
+ # Force device synchronize for now
3116
+ torch.cuda.synchronize()
3117
+ snapshot_handle = None
3118
+ checkpoint_handle = None
3119
+ if self.backend_type == BackendType.SSD:
3120
+ # Create a rocksdb snapshot
3121
+ if not no_snapshot:
3122
+ # Flush L1 and L2 caches
3123
+ self.flush(force=should_flush)
3124
+ logging.info(
3125
+ f"flush latency for weight states: {(time.time() - start_time) * 1000} ms"
3126
+ )
3127
+ snapshot_handle = self.ssd_db.create_snapshot()
3128
+ checkpoint_handle = self.ssd_db.get_active_checkpoint_uuid(self.step)
3129
+ logging.info(
3130
+ f"created snapshot for weight states: {snapshot_handle}, latency: {(time.time() - start_time) * 1000} ms"
3131
+ )
3132
+ elif self.backend_type == BackendType.DRAM:
3133
+ # If there is any ongoing eviction, wait until it is finished before state_dict
3134
+ # so that we reach a consistent model state before/after state_dict
3135
+ evict_wait_start_time = time.time()
3136
+ self.ssd_db.wait_until_eviction_done()
3137
+ logging.info(
3138
+ f"state_dict wait for ongoing eviction: {time.time() - evict_wait_start_time} s"
3139
+ )
3140
+ self.flush(force=should_flush)
3141
+ return snapshot_handle, checkpoint_handle
3142
+
3143
+ def get_embedding_dim_for_kvt(
3144
+ self, metaheader_dim: int, emb_dim: int, is_loading_checkpoint: bool
3145
+ ) -> int:
3146
+ if self.load_ckpt_without_opt:
3147
+ # For silvertorch publish, we don't want to load the optimizer into the backend due to limited CPU memory on the publish host.
3148
+ # So we need to load the whole row into the state dict when loading the checkpoint in st publish, then only save the weight into the backend; after that,
3149
+ # the backend will only have metaheader + weight.
3150
+ # For the first loading, we need to set the dim to metaheader_dim + emb_dim + optimizer_state_dim, otherwise the checkpoint loading will throw a size mismatch error;
3151
+ # after the first loading, we only need to get metaheader + weight from the backend for the state dict, so we can set the dim to metaheader_dim + emb
3152
+ if is_loading_checkpoint:
3153
+ return (
3154
+ (
3155
+ metaheader_dim # metaheader is already padded
3156
+ + pad4(emb_dim)
3157
+ + pad4(self.optimizer_state_dim)
3158
+ )
3159
+ if self.backend_return_whole_row
3160
+ else emb_dim
3161
+ )
3162
+ else:
3163
+ return metaheader_dim + pad4(emb_dim)
3164
+ else:
3165
+ return (
3166
+ (
3167
+ metaheader_dim # metaheader is already padded
3168
+ + pad4(emb_dim)
3169
+ + pad4(self.optimizer_state_dim)
3170
+ )
3171
+ if self.backend_return_whole_row
3172
+ else emb_dim
3173
+ )
3174
+
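Concretely, in the whole-row case the KVT width is the (already padded) metaheader plus the padded embedding dim plus the padded optimizer-state dim. A worked sketch, assuming pad4 rounds up to the next multiple of 4 and using made-up sizes (metaheader_dim = 8, emb_dim = 130, optimizer_state_dim = 2):

    def pad4(n: int) -> int:
        # Assumed semantics of the helper used above: round up to a multiple of 4
        return (n + 3) // 4 * 4

    metaheader_dim = 8          # already padded
    emb_dim = 130
    optimizer_state_dim = 2

    whole_row_dim = metaheader_dim + pad4(emb_dim) + pad4(optimizer_state_dim)
    weight_only_dim = metaheader_dim + pad4(emb_dim)

    assert pad4(130) == 132
    assert whole_row_dim == 144     # 8 + 132 + 4
    assert weight_only_dim == 140   # 8 + 132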
1768
3175
  @torch.jit.export
1769
3176
  def split_embedding_weights(
1770
3177
  self,
1771
3178
  no_snapshot: bool = True,
1772
3179
  should_flush: bool = False,
1773
- ) -> List[PartiallyMaterializedTensor]:
3180
+ ) -> tuple[ # TODO: make this a NamedTuple for readability
3181
+ Union[list[PartiallyMaterializedTensor], list[torch.Tensor]],
3182
+ Optional[list[torch.Tensor]],
3183
+ Optional[list[torch.Tensor]],
3184
+ Optional[list[torch.Tensor]],
3185
+ ]:
1774
3186
  """
1775
3187
  This method is intended to be used by the checkpointing engine
1776
3188
  only.
@@ -1784,50 +3196,454 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1784
3196
  operation, only set to True when necessary.
1785
3197
 
1786
3198
  Returns:
1787
- a list of partially materialized tensors, each representing a table
3199
+ a tuple of 4 lists, where each element corresponds to a logical table
3200
+ 1st arg: partially materialized tensors, each representing a table
3201
+ 2nd arg: input id sorted in bucket id ascending order
3202
+ 3rd arg: active id count per bucket id, tensor size is [bucket_id_end - bucket_id_start]
3203
+ where for the i th element, we have i + bucket_id_start = global bucket id
3204
+ 4th arg: kvzch eviction metadata for each input id sorted in bucket id ascending order
1788
3205
  """
1789
- # Force device synchronize for now
1790
- torch.cuda.synchronize()
1791
- # Create a snapshot
1792
- if no_snapshot:
1793
- snapshot_handle = None
1794
- else:
1795
- if should_flush:
1796
- # Flush L1 and L2 caches
1797
- self.flush()
1798
- snapshot_handle = self.ssd_db.create_snapshot()
3206
+ snapshot_handle, checkpoint_handle = self._may_create_snapshot_for_state_dict(
3207
+ no_snapshot=no_snapshot,
3208
+ should_flush=should_flush,
3209
+ )
1799
3210
 
1800
3211
  dtype = self.weights_precision.as_dtype()
1801
- splits = []
3212
+ if self.load_state_dict and self.kv_zch_params:
3213
+ # init for checkpoint loading
3214
+ assert (
3215
+ self._cached_kvzch_data is not None
3216
+ ), "weight id and bucket state are not initialized for load checkpointing"
3217
+ return (
3218
+ self._cached_kvzch_data.cached_weight_tensor_per_table,
3219
+ self._cached_kvzch_data.cached_id_tensor_per_table,
3220
+ self._cached_kvzch_data.cached_bucket_splits,
3221
+ [], # metadata tensor is not needed for checkpointing loading
3222
+ )
3223
+ start_time = time.time()
3224
+ pmt_splits = []
3225
+ bucket_sorted_id_splits = [] if self.kv_zch_params else None
3226
+ active_id_cnt_per_bucket_split = [] if self.kv_zch_params else None
3227
+ metadata_splits = [] if self.kv_zch_params else None
3228
+ skip_metadata = False
3229
+
3230
+ table_offset = 0
3231
+ for i, (emb_height, emb_dim) in enumerate(self.embedding_specs):
3232
+ is_loading_checkpoint = False
3233
+ bucket_ascending_id_tensor = None
3234
+ bucket_t = None
3235
+ metadata_tensor = None
3236
+ row_offset = table_offset
3237
+ metaheader_dim = 0
3238
+ if self.kv_zch_params:
3239
+ bucket_id_start, bucket_id_end = self.kv_zch_params.bucket_offsets[i]
3240
+ # pyre-ignore
3241
+ bucket_size = self.kv_zch_params.bucket_sizes[i]
3242
+ metaheader_dim = (
3243
+ # pyre-ignore[16]
3244
+ self.kv_zch_params.eviction_policy.meta_header_lens[i]
3245
+ )
1802
3246
 
1803
- row_offset = 0
1804
- for emb_height, emb_dim in self.embedding_specs:
1805
- tensor_wrapper = torch.classes.fbgemm.KVTensorWrapper(
1806
- db=self.ssd_db,
1807
- shape=[emb_height, emb_dim],
1808
- dtype=dtype,
3247
+ # linearize with table offset
3248
+ table_input_id_start = table_offset
3249
+ table_input_id_end = table_offset + emb_height
3250
+ # 1. get all keys from backend for one table
3251
+ unordered_id_tensor = self._ssd_db.get_keys_in_range_by_snapshot(
3252
+ table_input_id_start,
3253
+ table_input_id_end,
3254
+ table_offset,
3255
+ snapshot_handle,
3256
+ )
3257
+ # 2. sorting keys in bucket ascending order
3258
+ bucket_ascending_id_tensor, bucket_t = (
3259
+ torch.ops.fbgemm.get_bucket_sorted_indices_and_bucket_tensor(
3260
+ unordered_id_tensor,
3261
+ 0, # id--bucket hashing mode, 0 for chunk-based hashing, 1 for interleave-based hashing
3262
+ 0, # local bucket offset
3263
+ bucket_id_end - bucket_id_start, # local bucket num
3264
+ bucket_size,
3265
+ )
3266
+ )
3267
+ metadata_tensor = self._ssd_db.get_kv_zch_eviction_metadata_by_snapshot(
3268
+ bucket_ascending_id_tensor + table_offset,
3269
+ torch.as_tensor(bucket_ascending_id_tensor.size(0)),
3270
+ snapshot_handle,
3271
+ ).view(-1, 1)
3272
+
3273
+ # 3. convert local id back to global id
3274
+ bucket_ascending_id_tensor.add_(bucket_id_start * bucket_size)
3275
+
3276
+ if (
3277
+ bucket_ascending_id_tensor.size(0) == 0
3278
+ and self.local_weight_counts[i] > 0
3279
+ ):
3280
+ logging.info(
3281
+ f"before weight PMT loading, resetting id tensor with {self.local_weight_counts[i]}"
3282
+ )
3283
+ if self.global_id_per_rank[i].numel() != 0:
3284
+ assert (
3285
+ self.local_weight_counts[i]
3286
+ == self.global_id_per_rank[i].numel()
3287
+ ), f"local weight count and global id per rank size mismatch, with {self.local_weight_counts[i]} and {self.global_id_per_rank[i].numel()}"
3288
+ bucket_ascending_id_tensor = self.global_id_per_rank[i].to(
3289
+ device=torch.device("cpu"), dtype=torch.int64
3290
+ )
3291
+ else:
3292
+ bucket_ascending_id_tensor = torch.zeros(
3293
+ (self.local_weight_counts[i], 1),
3294
+ device=torch.device("cpu"),
3295
+ dtype=torch.int64,
3296
+ )
3297
+ skip_metadata = True
3298
+ is_loading_checkpoint = True
3299
+
3300
+ # self.local_weight_counts[i] = 0 # Reset the count
3301
+
3302
+ # pyre-ignore [16] bucket_sorted_id_splits is not None
3303
+ bucket_sorted_id_splits.append(bucket_ascending_id_tensor)
3304
+ active_id_cnt_per_bucket_split.append(bucket_t)
3305
+ if skip_metadata:
3306
+ metadata_splits = None
3307
+ else:
3308
+ metadata_splits.append(metadata_tensor)
3309
+
3310
+ # For KV ZCH TBE, sorted_indices holds global ids for checkpointing and publishing,
3311
+ # but the backend uses local ids during training, so the KVTensorWrapper needs to convert each global id to a local id
3312
+ # first and then linearize the local id with the table offset; the formula is x + table_offset - local_shard_offset.
3313
+ # To achieve this, the row_offset is set to (table_offset - local_shard_offset)
3314
+ row_offset = table_offset - (bucket_id_start * bucket_size)
3315
+
3316
+ tensor_wrapper = torch.classes.fbgemm.KVTensorWrapper(
3317
+ shape=[
3318
+ (
3319
+ bucket_ascending_id_tensor.size(0)
3320
+ if bucket_ascending_id_tensor is not None
3321
+ else emb_height
3322
+ ),
3323
+ self.get_embedding_dim_for_kvt(
3324
+ metaheader_dim, emb_dim, is_loading_checkpoint
3325
+ ),
3326
+ ],
3327
+ dtype=dtype,
1809
3328
  row_offset=row_offset,
1810
3329
  snapshot_handle=snapshot_handle,
3330
+ # set bucket_ascending_id_tensor to kvt wrapper, so narrow will follow the id order to return
3331
+ # embedding weights.
3332
+ sorted_indices=(
3333
+ bucket_ascending_id_tensor if self.kv_zch_params else None
3334
+ ),
3335
+ checkpoint_handle=checkpoint_handle,
3336
+ only_load_weight=(
3337
+ True
3338
+ if self.load_ckpt_without_opt and is_loading_checkpoint
3339
+ else False
3340
+ ),
3341
+ )
3342
+ (
3343
+ tensor_wrapper.set_embedding_rocks_dp_wrapper(self.ssd_db)
3344
+ if self.backend_type == BackendType.SSD
3345
+ else tensor_wrapper.set_dram_db_wrapper(self.ssd_db)
3346
+ )
3347
+ table_offset += emb_height
3348
+ pmt_splits.append(
3349
+ PartiallyMaterializedTensor(
3350
+ tensor_wrapper,
3351
+ True if self.kv_zch_params else False,
3352
+ )
3353
+ )
3354
+ logging.info(
3355
+ f"split_embedding_weights latency: {(time.time() - start_time) * 1000} ms, "
3356
+ )
3357
+ if self.kv_zch_params is not None:
3358
+ logging.info(
3359
+ # pyre-ignore [16]
3360
+ f"num ids list: {[ids.numel() for ids in bucket_sorted_id_splits]}"
3361
+ )
3362
+
3363
+ return (
3364
+ pmt_splits,
3365
+ bucket_sorted_id_splits,
3366
+ active_id_cnt_per_bucket_split,
3367
+ metadata_splits,
3368
+ )
3369
+
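The row-offset comment inside the loop above spells out the id translation the wrapper relies on: the backend stores rows by linearized local id, so setting row_offset = table_offset - bucket_id_start * bucket_size lets a global id x be mapped with a single addition. A small numeric sketch with hypothetical shard parameters:

    # Hypothetical table: starts at linearized row 20_000, and this rank owns
    # buckets [50, 60) of size 1_000, i.e. global ids [50_000, 60_000).
    table_offset = 20_000
    bucket_id_start, bucket_size = 50, 1_000

    row_offset = table_offset - bucket_id_start * bucket_size   # 20_000 - 50_000 = -30_000

    for global_id in (50_000, 50_007, 59_999):
        local_id = global_id - bucket_id_start * bucket_size     # 0, 7, 9_999
        # The wrapper's single addition reproduces the linearized local row
        assert global_id + row_offset == table_offset + local_id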
3370
+ @torch.jit.ignore
3371
+ def _apply_state_dict_w_offloading(self) -> None:
3372
+ # Row count per table
3373
+ rows, _ = zip(*self.embedding_specs)
3374
+ # Cumulative row counts per table for rowwise states
3375
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
3376
+
3377
+ for t, _ in enumerate(self.embedding_specs):
3378
+ # pyre-ignore [16]
3379
+ bucket_id_start, _ = self.kv_zch_params.bucket_offsets[t]
3380
+ # pyre-ignore [16]
3381
+ bucket_size = self.kv_zch_params.bucket_sizes[t]
3382
+ row_offset = row_count_cumsum[t] - bucket_id_start * bucket_size
3383
+
3384
+ # pyre-ignore [16]
3385
+ weight_state = self._cached_kvzch_data.cached_weight_tensor_per_table[t]
3386
+ # pyre-ignore [16]
3387
+ opt_states = self._cached_kvzch_data.cached_optimizer_states_per_table[t]
3388
+
3389
+ self.streaming_write_weight_and_id_per_table(
3390
+ weight_state,
3391
+ opt_states,
3392
+ # pyre-ignore [16]
3393
+ self._cached_kvzch_data.cached_id_tensor_per_table[t],
3394
+ row_offset,
3395
+ )
3396
+ self._cached_kvzch_data.cached_weight_tensor_per_table[t] = None
3397
+ self._cached_kvzch_data.cached_optimizer_states_per_table[t] = None
3398
+
3399
+ @torch.jit.ignore
3400
+ def _apply_state_dict_no_offloading(self) -> None:
3401
+ # Row count per table
3402
+ rows, _ = zip(*self.embedding_specs)
3403
+ # Cumulative row counts per table for rowwise states
3404
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
3405
+
3406
+ def copy_optimizer_state_(dst: Tensor, src: Tensor, indices: Tensor) -> None:
3407
+ device = dst.device
3408
+ dst.index_put_(
3409
+ indices=(
3410
+ # indices is expected to be a tuple of Tensors, not Tensor
3411
+ indices.to(device).view(-1),
3412
+ ),
3413
+ values=src.to(device),
3414
+ )
3415
+
3416
+ for t, _ in enumerate(rows):
3417
+ # pyre-ignore [16]
3418
+ bucket_id_start, _ = self.kv_zch_params.bucket_offsets[t]
3419
+ # pyre-ignore [16]
3420
+ bucket_size = self.kv_zch_params.bucket_sizes[t]
3421
+ row_offset = row_count_cumsum[t] - bucket_id_start * bucket_size
3422
+
3423
+ # pyre-ignore [16]
3424
+ weights = self._cached_kvzch_data.cached_weight_tensor_per_table[t]
3425
+ # pyre-ignore [16]
3426
+ ids = self._cached_kvzch_data.cached_id_tensor_per_table[t]
3427
+ local_ids = ids + row_offset
3428
+
3429
+ logging.info(
3430
+ f"applying sd for table {t} without optimizer offloading, local_ids is {local_ids}"
3431
+ )
3432
+ # pyre-ignore [16]
3433
+ opt_states = self._cached_kvzch_data.cached_optimizer_states_per_table[t]
3434
+
3435
+ # Set up the plan for copying optimizer states over
3436
+ if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
3437
+ mapping = [(opt_states[0], self.momentum1_dev)]
3438
+ elif self.optimizer in [OptimType.PARTIAL_ROWWISE_ADAM, OptimType.ADAM]:
3439
+ mapping = [
3440
+ (opt_states[0], self.momentum1_dev),
3441
+ (opt_states[1], self.momentum2_dev),
3442
+ ]
3443
+ else:
3444
+ mapping = []
3445
+
3446
+ # Execute the plan and copy the optimizer states over
3447
+ # pyre-ignore [6]
3448
+ [copy_optimizer_state_(dst, src, local_ids) for (src, dst) in mapping]
3449
+
3450
+ self.ssd_db.set_cuda(
3451
+ local_ids.view(-1),
3452
+ weights,
3453
+ torch.as_tensor(local_ids.size(0)),
3454
+ 1,
3455
+ False,
3456
+ )
3457
+
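copy_optimizer_state_ above relies on Tensor.index_put_, which expects a tuple of index tensors (one per indexed dimension) rather than a single tensor, hence the (indices.view(-1),) wrapping. A minimal sketch:

    import torch

    dst = torch.zeros(6)
    src = torch.tensor([1.0, 2.0, 3.0])
    ids = torch.tensor([[4], [0], [2]])   # shaped like the cached id tensors, [N, 1]

    # index_put_ takes a tuple of index tensors, one per indexed dimension
    dst.index_put_(indices=(ids.view(-1),), values=src)

    assert dst.tolist() == [2.0, 0.0, 3.0, 0.0, 1.0, 0.0]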
3458
+ @torch.jit.ignore
3459
+ def apply_state_dict(self) -> None:
3460
+ if self.backend_return_whole_row:
3461
+ logging.info(
3462
+ "backend_return_whole_row is enabled, no need to apply_state_dict"
3463
+ )
3464
+ return
3465
+ # After checkpoint loading, the _cached_kvzch_data will be loaded from checkpoint.
3466
+ # Caller should call this function to apply the cached states to backend.
3467
+ if self.load_state_dict is False:
3468
+ return
3469
+ self.load_state_dict = False
3470
+ assert self.kv_zch_params is not None, "apply_state_dict supports KV ZCH only"
3471
+ assert (
3472
+ self._cached_kvzch_data is not None
3473
+ and self._cached_kvzch_data.cached_optimizer_states_per_table is not None
3474
+ ), "optimizer state is not initialized for load checkpointing"
3475
+ assert (
3476
+ self._cached_kvzch_data.cached_weight_tensor_per_table is not None
3477
+ and self._cached_kvzch_data.cached_id_tensor_per_table is not None
3478
+ ), "weight and id state is not initialized for load checkpointing"
3479
+
3480
+ # Compute the number of elements of cache_dtype needed to store the
3481
+ # optimizer state, round to the nearest 4
3482
+ # optimizer_dim = self.optimizer.optimizer_state_size_dim(dtype)
3483
+ # apply weight and optimizer state per table
3484
+ if self.enable_optimizer_offloading:
3485
+ self._apply_state_dict_w_offloading()
3486
+ else:
3487
+ self._apply_state_dict_no_offloading()
3488
+
3489
+ self.clear_cache()
3490
+
3491
+ @torch.jit.ignore
3492
+ def streaming_write_weight_and_id_per_table(
3493
+ self,
3494
+ weight_state: torch.Tensor,
3495
+ opt_states: list[torch.Tensor],
3496
+ id_tensor: torch.Tensor,
3497
+ row_offset: int,
3498
+ ) -> None:
3499
+ """
3500
+ This function is used to write weight, optimizer and id to the backend using kvt wrapper.
3501
+ To avoid excessive memory use, we write the weights and ids to the backend in a rolling-window manner.
3502
+
3503
+ Args:
3504
+ weight_state (torch.tensor): The weight state tensor to be written.
3505
+ opt_states (torch.tensor): The optimizer state tensor(s) to be written.
3506
+ id_tensor (torch.tensor): The id tensor to be written.
3507
+ """
3508
+ D = weight_state.size(1)
3509
+ dtype = self.weights_precision.as_dtype()
3510
+
3511
+ optimizer_state_byte_offsets = self.optimizer.byte_offsets_along_row(
3512
+ D, self.weights_precision, self.optimizer_state_dtypes
3513
+ )
3514
+ optimizer_state_size_table = self.optimizer.state_size_table(D)
3515
+
3516
+ kvt = torch.classes.fbgemm.KVTensorWrapper(
3517
+ shape=[weight_state.size(0), self.cache_row_dim],
3518
+ dtype=dtype,
3519
+ row_offset=row_offset,
3520
+ snapshot_handle=None,
3521
+ sorted_indices=id_tensor,
3522
+ )
3523
+ (
3524
+ kvt.set_embedding_rocks_dp_wrapper(self.ssd_db)
3525
+ if self.backend_type == BackendType.SSD
3526
+ else kvt.set_dram_db_wrapper(self.ssd_db)
3527
+ )
3528
+
3529
+ # TODO: make chunk_size configurable or dynamic
3530
+ chunk_size = 10000
3531
+ row = weight_state.size(0)
3532
+
3533
+ for i in range(0, row, chunk_size):
3534
+ # Construct the chunk buffer, using the weights precision as the dtype
3535
+ length = min(chunk_size, row - i)
3536
+ chunk_buffer = torch.empty(
3537
+ length,
3538
+ self.cache_row_dim,
3539
+ dtype=dtype,
3540
+ device="cpu",
3541
+ )
3542
+
3543
+ # Copy the weight state over to the chunk buffer
3544
+ chunk_buffer[:, : weight_state.size(1)] = weight_state[i : i + length, :]
3545
+
3546
+ # Copy the optimizer state(s) over to the chunk buffer
3547
+ for o, opt_state in enumerate(opt_states):
3548
+ # Fetch the state name based on the index
3549
+ state_name = self.optimizer.state_names()[o]
3550
+
3551
+ # Fetch the byte offsets for the optimizer state by its name
3552
+ start, end = optimizer_state_byte_offsets[state_name]
3553
+
3554
+ # Assume that the opt_state passed in already has dtype matching
3555
+ # self.optimizer_state_dtypes[state_name]
3556
+ opt_state_byteview = opt_state.view(
3557
+ # Force it to be 2D table, with row size matching the
3558
+ # optimizer state size
3559
+ -1,
3560
+ optimizer_state_size_table[state_name],
3561
+ ).view(
3562
+ # Then force tensor to byte view
3563
+ dtype=torch.uint8
3564
+ )
3565
+
3566
+ # Convert the chunk buffer and optimizer state to byte views
3567
+ # Then use the start and end offsets to narrow the chunk buffer
3568
+ # and copy opt_state over
3569
+ chunk_buffer.view(dtype=torch.uint8)[:, start:end] = opt_state_byteview[
3570
+ i : i + length, :
3571
+ ]
3572
+
3573
+ # Write chunk to KVTensor
3574
+ kvt.set_weights_and_ids(chunk_buffer, id_tensor[i : i + length, :].view(-1))
3575
+
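streaming_write_weight_and_id_per_table packs each row's weight bytes and its optimizer-state bytes into one chunk-buffer row before handing the chunk to the KVTensorWrapper. Below is a minimal, standalone sketch of that byte-offset packing with assumed sizes (4 fp16 weight elements followed by a single fp32 state, i.e. a 12-byte row); the real offsets come from optimizer.byte_offsets_along_row and are not hard-coded like this:

    import torch

    N, D = 3, 4
    weights = torch.randn(N, D, dtype=torch.float16)        # 8 bytes of weights per row
    momentum = torch.randn(N, 1, dtype=torch.float32)       # 4 bytes of optimizer state per row

    row_buffer = torch.empty(N, 6, dtype=torch.float16)     # 6 fp16 elements = 12 bytes per row
    row_buffer[:, :D] = weights                              # weight bytes occupy [0, 8)
    row_buffer.view(dtype=torch.uint8)[:, 8:12] = momentum.view(dtype=torch.uint8)  # state bytes occupy [8, 12)

    # Unpacking reverses the byte-offset slicing.
    recovered = row_buffer.view(dtype=torch.uint8)[:, 8:12].view(dtype=torch.float32)
    assert torch.equal(recovered, momentum)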
3576
+ @torch.jit.ignore
3577
+ def enable_load_state_dict_mode(self) -> None:
3578
+ if self.backend_return_whole_row:
3579
+ logging.info(
3580
+ "backend_return_whole_row is enabled, no need to enable load_state_dict mode"
3581
+ )
3582
+ return
3583
+ # Enable load state dict mode before loading checkpoint
3584
+ if self.load_state_dict:
3585
+ return
3586
+ self.load_state_dict = True
3587
+
3588
+ dtype = self.weights_precision.as_dtype()
3589
+ _, dims = zip(*self.embedding_specs)
3590
+
3591
+ self._cached_kvzch_data = KVZCHCachedData([], [], [], [])
3592
+
3593
+ for i, _ in enumerate(self.embedding_specs):
3594
+ # For checkpoint loading, we need to store the weight and id
3595
+ # tensor temporarily in memory. First check that the local_weight_counts
3596
+ # are properly set before even initializing the optimizer states
3597
+ assert (
3598
+ self.local_weight_counts[i] > 0
3599
+ ), f"local_weight_counts for table {i} is not set"
3600
+
3601
+ # pyre-ignore [16]
3602
+ self._cached_kvzch_data.cached_optimizer_states_per_table = (
3603
+ self.optimizer.empty_states(
3604
+ self.local_weight_counts,
3605
+ dims,
3606
+ self.optimizer_state_dtypes,
3607
+ )
3608
+ )
3609
+
3610
+ for i, (_, emb_dim) in enumerate(self.embedding_specs):
3611
+ # pyre-ignore [16]
3612
+ bucket_id_start, bucket_id_end = self.kv_zch_params.bucket_offsets[i]
3613
+ rows = self.local_weight_counts[i]
3614
+ weight_state = torch.empty(rows, emb_dim, dtype=dtype, device="cpu")
3615
+ # pyre-ignore [16]
3616
+ self._cached_kvzch_data.cached_weight_tensor_per_table.append(weight_state)
3617
+ logging.info(
3618
+ f"for checkpoint loading, table {i}, weight_state shape is {weight_state.shape}"
3619
+ )
3620
+ id_tensor = torch.zeros((rows, 1), dtype=torch.int64, device="cpu")
3621
+ # pyre-ignore [16]
3622
+ self._cached_kvzch_data.cached_id_tensor_per_table.append(id_tensor)
3623
+ # pyre-ignore [16]
3624
+ self._cached_kvzch_data.cached_bucket_splits.append(
3625
+ torch.empty(
3626
+ (bucket_id_end - bucket_id_start, 1),
3627
+ dtype=torch.int64,
3628
+ device="cpu",
3629
+ )
1811
3630
  )
1812
- row_offset += emb_height
1813
- splits.append(PartiallyMaterializedTensor(tensor_wrapper))
1814
- return splits
1815
3631
 
1816
3632
  @torch.jit.export
1817
3633
  def set_learning_rate(self, lr: float) -> None:
1818
3634
  """
1819
3635
  Sets the learning rate.
3636
+
3637
+ Args:
3638
+ lr (float): The learning rate value to set to
1820
3639
  """
1821
3640
  self._set_learning_rate(lr)
1822
3641
 
1823
3642
  def get_learning_rate(self) -> float:
1824
3643
  """
1825
- Sets the learning rate.
1826
-
1827
- Args:
1828
- lr (float): The learning rate value to set to
3644
+ Return the current learning rate.
1829
3645
  """
1830
- return self.optimizer_args.learning_rate
3646
+ return self.learning_rate_tensor.item()
1831
3647
 
1832
3648
  @torch.jit.ignore
1833
3649
  def _set_learning_rate(self, lr: float) -> float:
@@ -1835,14 +3651,30 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1835
3651
  Helper function to script `set_learning_rate`.
1836
3652
  Note that returning None does not work.
1837
3653
  """
1838
- self.optimizer_args = self.optimizer_args._replace(learning_rate=lr)
3654
+ self.learning_rate_tensor = torch.tensor(
3655
+ lr, device=torch.device("cpu"), dtype=torch.float32
3656
+ )
1839
3657
  return 0.0
1840
3658
 
1841
- def flush(self) -> None:
3659
+ def flush(self, force: bool = False) -> None:
3660
+ # allow force flush from split_embedding_weights to cover edge cases, e.g. checkpointing
3661
+ # after training 0 batches
3662
+ if not self.training:
3663
+ # in eval mode, we should not write anything back to the embedding storage
3664
+ return
3665
+
3666
+ if self.step == self.last_flush_step and not force:
3667
+ logging.info(
3668
+ f"SSD TBE has been flushed at {self.last_flush_step=} already for tbe:{self.tbe_unique_id}"
3669
+ )
3670
+ return
3671
+ logging.info(
3672
+ f"SSD TBE flush at {self.step=}, it is an expensive call please be cautious"
3673
+ )
1842
3674
  active_slots_mask = self.lxu_cache_state != -1
1843
3675
 
1844
3676
  active_weights_gpu = self.lxu_cache_weights[active_slots_mask.view(-1)].view(
1845
- -1, self.max_D
3677
+ -1, self.cache_row_dim
1846
3678
  )
1847
3679
  active_ids_gpu = self.lxu_cache_state.view(-1)[active_slots_mask.view(-1)]
1848
3680
 
@@ -1858,24 +3690,38 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1858
3690
  torch.tensor([active_ids_cpu.numel()]),
1859
3691
  )
1860
3692
  self.ssd_db.flush()
3693
+ self.last_flush_step = self.step
3694
+
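The new flush signature adds two guards: it is a no-op in eval mode, and it skips the write-back when the step has not advanced since the last flush unless force=True (the checkpoint-after-zero-trained-batches edge case noted in the comment above). A condensed, hypothetical sketch of just that guard logic:

    def should_flush(training: bool, step: int, last_flush_step: int, force: bool = False) -> bool:
        if not training:
            return False                  # never write back in eval mode
        if step == last_flush_step and not force:
            return False                  # already flushed at this step
        return True                       # proceed with the expensive write-back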
3695
+ def create_rocksdb_hard_link_snapshot(self) -> None:
3696
+ """
3697
+ Create a rocksdb hard link snapshot to provide cross-process access to the underlying data
3698
+ """
3699
+ if self.backend_type == BackendType.SSD:
3700
+ self.ssd_db.create_rocksdb_hard_link_snapshot(self.step)
3701
+ else:
3702
+ logging.warning(
3703
+ "create_rocksdb_hard_link_snapshot is only supported for SSD backend"
3704
+ )
1861
3705
 
1862
3706
  def prepare_inputs(
1863
3707
  self,
1864
3708
  indices: Tensor,
1865
3709
  offsets: Tensor,
1866
3710
  per_sample_weights: Optional[Tensor] = None,
1867
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
1868
- ) -> Tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
3711
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
3712
+ vbe_output: Optional[Tensor] = None,
3713
+ vbe_output_offsets: Optional[Tensor] = None,
3714
+ ) -> tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
1869
3715
  """
1870
3716
  Prepare TBE inputs
1871
3717
  """
1872
3718
  # Generate VBE metadata
1873
3719
  vbe_metadata = self._generate_vbe_metadata(
1874
- offsets, batch_size_per_feature_per_rank
3720
+ offsets, batch_size_per_feature_per_rank, vbe_output, vbe_output_offsets
1875
3721
  )
1876
3722
 
1877
3723
  # Force casting indices and offsets to long
1878
- (indices, offsets) = indices.long(), offsets.long()
3724
+ indices, offsets = indices.long(), offsets.long()
1879
3725
 
1880
3726
  # Force casting per_sample_weights to float
1881
3727
  if per_sample_weights is not None:
@@ -1891,12 +3737,13 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1891
3737
  per_sample_weights,
1892
3738
  B_offsets=vbe_metadata.B_offsets,
1893
3739
  max_B=vbe_metadata.max_B,
3740
+ bounds_check_version=self.bounds_check_version,
1894
3741
  )
1895
3742
 
1896
3743
  return indices, offsets, per_sample_weights, vbe_metadata
1897
3744
 
1898
3745
  @torch.jit.ignore
1899
- def _report_ssd_stats(self) -> None:
3746
+ def _report_kv_backend_stats(self) -> None:
1900
3747
  """
1901
3748
  All ssd stats report function entrance
1902
3749
  """
@@ -1906,9 +3753,15 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1906
3753
  if not self.stats_reporter.should_report(self.step):
1907
3754
  return
1908
3755
  self._report_ssd_l1_cache_stats()
1909
- self._report_ssd_io_stats()
1910
- self._report_ssd_mem_usage()
1911
- self._report_l2_cache_perf_stats()
3756
+
3757
+ if self.backend_type == BackendType.SSD:
3758
+ self._report_ssd_io_stats()
3759
+ self._report_ssd_mem_usage()
3760
+ self._report_l2_cache_perf_stats()
3761
+ if self.backend_type == BackendType.DRAM:
3762
+ self._report_dram_kv_perf_stats()
3763
+ if self.kv_zch_params and self.kv_zch_params.eviction_policy:
3764
+ self._report_eviction_stats()
1912
3765
 
1913
3766
  @torch.jit.ignore
1914
3767
  def _report_ssd_l1_cache_stats(self) -> None:
@@ -1925,7 +3778,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1925
3778
  ssd_cache_stats = self.ssd_cache_stats.tolist()
1926
3779
  if len(self.last_reported_ssd_stats) == 0:
1927
3780
  self.last_reported_ssd_stats = [0.0] * len(ssd_cache_stats)
1928
- ssd_cache_stats_delta: List[float] = [0.0] * len(ssd_cache_stats)
3781
+ ssd_cache_stats_delta: list[float] = [0.0] * len(ssd_cache_stats)
1929
3782
  for i in range(len(ssd_cache_stats)):
1930
3783
  ssd_cache_stats_delta[i] = (
1931
3784
  ssd_cache_stats[i] - self.last_reported_ssd_stats[i]
@@ -1942,11 +3795,11 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1942
3795
  data_bytes=int(
1943
3796
  ssd_cache_stats_delta[stat_index.value]
1944
3797
  * element_size
1945
- * self.max_D
3798
+ * self.cache_row_dim
1946
3799
  / passed_steps
1947
3800
  ),
1948
3801
  )
1949
- # pyre-ignore
3802
+
1950
3803
  self.stats_reporter.report_data_amount(
1951
3804
  iteration_step=self.step,
1952
3805
  event_name=f"ssd_tbe.prefetch.cache_stats.{stat_index.name.lower()}",
@@ -1973,35 +3826,35 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1973
3826
  bwd_l1_cnflct_miss_write_back_dur = ssd_io_duration[3]
1974
3827
  flush_write_dur = ssd_io_duration[4]
1975
3828
 
1976
- # pyre-ignore
3829
+ # pyre-ignore [16]
1977
3830
  self.stats_reporter.report_duration(
1978
3831
  iteration_step=self.step,
1979
3832
  event_name="ssd.io_duration.read_us",
1980
3833
  duration_ms=ssd_read_dur_us,
1981
3834
  time_unit="us",
1982
3835
  )
1983
- # pyre-ignore
3836
+
1984
3837
  self.stats_reporter.report_duration(
1985
3838
  iteration_step=self.step,
1986
3839
  event_name="ssd.io_duration.write.fwd_rocksdb_read_us",
1987
3840
  duration_ms=fwd_rocksdb_read_dur,
1988
3841
  time_unit="us",
1989
3842
  )
1990
- # pyre-ignore
3843
+
1991
3844
  self.stats_reporter.report_duration(
1992
3845
  iteration_step=self.step,
1993
3846
  event_name="ssd.io_duration.write.fwd_l1_eviction_us",
1994
3847
  duration_ms=fwd_l1_eviction_dur,
1995
3848
  time_unit="us",
1996
3849
  )
1997
- # pyre-ignore
3850
+
1998
3851
  self.stats_reporter.report_duration(
1999
3852
  iteration_step=self.step,
2000
3853
  event_name="ssd.io_duration.write.bwd_l1_cnflct_miss_write_back_us",
2001
3854
  duration_ms=bwd_l1_cnflct_miss_write_back_dur,
2002
3855
  time_unit="us",
2003
3856
  )
2004
- # pyre-ignore
3857
+
2005
3858
  self.stats_reporter.report_duration(
2006
3859
  iteration_step=self.step,
2007
3860
  event_name="ssd.io_duration.write.flush_write_us",
@@ -2023,25 +3876,25 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2023
3876
  memtable_usage = mem_usage_list[2]
2024
3877
  block_cache_pinned_usage = mem_usage_list[3]
2025
3878
 
2026
- # pyre-ignore
3879
+ # pyre-ignore [16]
2027
3880
  self.stats_reporter.report_data_amount(
2028
3881
  iteration_step=self.step,
2029
3882
  event_name="ssd.mem_usage.block_cache",
2030
3883
  data_bytes=block_cache_usage,
2031
3884
  )
2032
- # pyre-ignore
3885
+
2033
3886
  self.stats_reporter.report_data_amount(
2034
3887
  iteration_step=self.step,
2035
3888
  event_name="ssd.mem_usage.estimate_table_reader",
2036
3889
  data_bytes=estimate_table_reader_usage,
2037
3890
  )
2038
- # pyre-ignore
3891
+
2039
3892
  self.stats_reporter.report_data_amount(
2040
3893
  iteration_step=self.step,
2041
3894
  event_name="ssd.mem_usage.memtable",
2042
3895
  data_bytes=memtable_usage,
2043
3896
  )
2044
- # pyre-ignore
3897
+
2045
3898
  self.stats_reporter.report_data_amount(
2046
3899
  iteration_step=self.step,
2047
3900
  event_name="ssd.mem_usage.block_cache_pinned",
@@ -2175,7 +4028,408 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2175
4028
  time_unit="us",
2176
4029
  )
2177
4030
 
2178
- # pyre-ignore
4031
+ @torch.jit.ignore
4032
+ def _report_eviction_stats(self) -> None:
4033
+ if self.stats_reporter is None:
4034
+ return
4035
+
4036
+ stats_reporter: TBEStatsReporter = self.stats_reporter
4037
+ if not stats_reporter.should_report(self.step):
4038
+ return
4039
+
4040
+ # skip metrics reporting when eviction is disabled
4041
+ if self.kv_zch_params.eviction_policy.eviction_trigger_mode == 0:
4042
+ return
4043
+
4044
+ T = len(set(self.feature_table_map))
4045
+ evicted_counts = torch.zeros(T, dtype=torch.int64)
4046
+ processed_counts = torch.zeros(T, dtype=torch.int64)
4047
+ eviction_threshold_with_dry_run = torch.zeros(T, dtype=torch.float)
4048
+ full_duration_ms = torch.tensor(0, dtype=torch.int64)
4049
+ exec_duration_ms = torch.tensor(0, dtype=torch.int64)
4050
+ self.ssd_db.get_feature_evict_metric(
4051
+ evicted_counts,
4052
+ processed_counts,
4053
+ eviction_threshold_with_dry_run,
4054
+ full_duration_ms,
4055
+ exec_duration_ms,
4056
+ )
4057
+
4058
+ stats_reporter.report_data_amount(
4059
+ iteration_step=self.step,
4060
+ event_name=self.eviction_sum_evicted_counts_stats_name,
4061
+ data_bytes=int(evicted_counts.sum().item()),
4062
+ enable_tb_metrics=True,
4063
+ )
4064
+ stats_reporter.report_data_amount(
4065
+ iteration_step=self.step,
4066
+ event_name=self.eviction_sum_processed_counts_stats_name,
4067
+ data_bytes=int(processed_counts.sum().item()),
4068
+ enable_tb_metrics=True,
4069
+ )
4070
+ if processed_counts.sum().item() != 0:
4071
+ stats_reporter.report_data_amount(
4072
+ iteration_step=self.step,
4073
+ event_name=self.eviction_evict_rate_stats_name,
4074
+ data_bytes=int(
4075
+ evicted_counts.sum().item() * 100 / processed_counts.sum().item()
4076
+ ),
4077
+ enable_tb_metrics=True,
4078
+ )
4079
+ for t in self.feature_table_map:
4080
+ stats_reporter.report_data_amount(
4081
+ iteration_step=self.step,
4082
+ event_name=f"eviction.feature_table.{t}.evicted_counts",
4083
+ data_bytes=int(evicted_counts[t].item()),
4084
+ enable_tb_metrics=True,
4085
+ )
4086
+ stats_reporter.report_data_amount(
4087
+ iteration_step=self.step,
4088
+ event_name=f"eviction.feature_table.{t}.processed_counts",
4089
+ data_bytes=int(processed_counts[t].item()),
4090
+ enable_tb_metrics=True,
4091
+ )
4092
+ if processed_counts[t].item() != 0:
4093
+ stats_reporter.report_data_amount(
4094
+ iteration_step=self.step,
4095
+ event_name=f"eviction.feature_table.{t}.evict_rate",
4096
+ data_bytes=int(
4097
+ evicted_counts[t].item() * 100 / processed_counts[t].item()
4098
+ ),
4099
+ enable_tb_metrics=True,
4100
+ )
4101
+ stats_reporter.report_duration(
4102
+ iteration_step=self.step,
4103
+ event_name="eviction.feature_table.full_duration_ms",
4104
+ duration_ms=full_duration_ms.item(),
4105
+ time_unit="ms",
4106
+ enable_tb_metrics=True,
4107
+ )
4108
+ stats_reporter.report_duration(
4109
+ iteration_step=self.step,
4110
+ event_name="eviction.feature_table.exec_duration_ms",
4111
+ duration_ms=exec_duration_ms.item(),
4112
+ time_unit="ms",
4113
+ enable_tb_metrics=True,
4114
+ )
4115
+ if full_duration_ms.item() != 0:
4116
+ stats_reporter.report_data_amount(
4117
+ iteration_step=self.step,
4118
+ event_name="eviction.feature_table.exec_div_full_duration_rate",
4119
+ data_bytes=int(exec_duration_ms.item() * 100 / full_duration_ms.item()),
4120
+ enable_tb_metrics=True,
4121
+ )
4122
+
4123
+ @torch.jit.ignore
4124
+ def _report_dram_kv_perf_stats(self) -> None:
4125
+ """
4126
+ EmbeddingKVDB will hold stats for DRAM cache performance in fwd/bwd
4127
+ this function fetches the stats from EmbeddingKVDB and reports them via stats_reporter
4128
+ """
4129
+ if self.stats_reporter is None:
4130
+ return
4131
+
4132
+ stats_reporter: TBEStatsReporter = self.stats_reporter
4133
+ if not stats_reporter.should_report(self.step):
4134
+ return
4135
+
4136
+ dram_kv_perf_stats = self.ssd_db.get_dram_kv_perf(
4137
+ self.step, stats_reporter.report_interval # pyre-ignore
4138
+ )
4139
+
4140
+ if len(dram_kv_perf_stats) != 36:
4141
+ logging.error("dram cache perf stats should have 36 elements")
4142
+ return
4143
+
4144
+ dram_read_duration = dram_kv_perf_stats[0]
4145
+ dram_read_sharding_duration = dram_kv_perf_stats[1]
4146
+ dram_read_cache_hit_copy_duration = dram_kv_perf_stats[2]
4147
+ dram_read_fill_row_storage_duration = dram_kv_perf_stats[3]
4148
+ dram_read_lookup_cache_duration = dram_kv_perf_stats[4]
4149
+ dram_read_acquire_lock_duration = dram_kv_perf_stats[5]
4150
+ dram_read_missing_load = dram_kv_perf_stats[6]
4151
+ dram_write_sharing_duration = dram_kv_perf_stats[7]
4152
+
4153
+ dram_fwd_l1_eviction_write_duration = dram_kv_perf_stats[8]
4154
+ dram_fwd_l1_eviction_write_allocate_duration = dram_kv_perf_stats[9]
4155
+ dram_fwd_l1_eviction_write_cache_copy_duration = dram_kv_perf_stats[10]
4156
+ dram_fwd_l1_eviction_write_lookup_cache_duration = dram_kv_perf_stats[11]
4157
+ dram_fwd_l1_eviction_write_acquire_lock_duration = dram_kv_perf_stats[12]
4158
+ dram_fwd_l1_eviction_write_missing_load = dram_kv_perf_stats[13]
4159
+
4160
+ dram_bwd_l1_cnflct_miss_write_duration = dram_kv_perf_stats[14]
4161
+ dram_bwd_l1_cnflct_miss_write_allocate_duration = dram_kv_perf_stats[15]
4162
+ dram_bwd_l1_cnflct_miss_write_cache_copy_duration = dram_kv_perf_stats[16]
4163
+ dram_bwd_l1_cnflct_miss_write_lookup_cache_duration = dram_kv_perf_stats[17]
4164
+ dram_bwd_l1_cnflct_miss_write_acquire_lock_duration = dram_kv_perf_stats[18]
4165
+ dram_bwd_l1_cnflct_miss_write_missing_load = dram_kv_perf_stats[19]
4166
+
4167
+ dram_kv_allocated_bytes = dram_kv_perf_stats[20]
4168
+ dram_kv_actual_used_chunk_bytes = dram_kv_perf_stats[21]
4169
+ dram_kv_num_rows = dram_kv_perf_stats[22]
4170
+ dram_kv_read_counts = dram_kv_perf_stats[23]
4171
+ dram_metadata_write_sharding_total_duration = dram_kv_perf_stats[24]
4172
+ dram_metadata_write_total_duration = dram_kv_perf_stats[25]
4173
+ dram_metadata_write_allocate_avg_duration = dram_kv_perf_stats[26]
4174
+ dram_metadata_write_lookup_cache_avg_duration = dram_kv_perf_stats[27]
4175
+ dram_metadata_write_acquire_lock_avg_duration = dram_kv_perf_stats[28]
4176
+ dram_metadata_write_cache_miss_avg_count = dram_kv_perf_stats[29]
4177
+
4178
+ dram_read_metadata_total_duration = dram_kv_perf_stats[30]
4179
+ dram_read_metadata_sharding_total_duration = dram_kv_perf_stats[31]
4180
+ dram_read_metadata_cache_hit_copy_avg_duration = dram_kv_perf_stats[32]
4181
+ dram_read_metadata_lookup_cache_total_avg_duration = dram_kv_perf_stats[33]
4182
+ dram_read_metadata_acquire_lock_avg_duration = dram_kv_perf_stats[34]
4183
+ dram_read_read_metadata_load_size = dram_kv_perf_stats[35]
4184
+
4185
+ stats_reporter.report_duration(
4186
+ iteration_step=self.step,
4187
+ event_name="dram_kv.perf.get.dram_read_duration_us",
4188
+ duration_ms=dram_read_duration,
4189
+ enable_tb_metrics=True,
4190
+ time_unit="us",
4191
+ )
4192
+ stats_reporter.report_duration(
4193
+ iteration_step=self.step,
4194
+ event_name="dram_kv.perf.get.dram_read_sharding_duration_us",
4195
+ duration_ms=dram_read_sharding_duration,
4196
+ enable_tb_metrics=True,
4197
+ time_unit="us",
4198
+ )
4199
+ stats_reporter.report_duration(
4200
+ iteration_step=self.step,
4201
+ event_name="dram_kv.perf.get.dram_read_cache_hit_copy_duration_us",
4202
+ duration_ms=dram_read_cache_hit_copy_duration,
4203
+ enable_tb_metrics=True,
4204
+ time_unit="us",
4205
+ )
4206
+ stats_reporter.report_duration(
4207
+ iteration_step=self.step,
4208
+ event_name="dram_kv.perf.get.dram_read_fill_row_storage_duration_us",
4209
+ duration_ms=dram_read_fill_row_storage_duration,
4210
+ enable_tb_metrics=True,
4211
+ time_unit="us",
4212
+ )
4213
+ stats_reporter.report_duration(
4214
+ iteration_step=self.step,
4215
+ event_name="dram_kv.perf.get.dram_read_lookup_cache_duration_us",
4216
+ duration_ms=dram_read_lookup_cache_duration,
4217
+ enable_tb_metrics=True,
4218
+ time_unit="us",
4219
+ )
4220
+ stats_reporter.report_duration(
4221
+ iteration_step=self.step,
4222
+ event_name="dram_kv.perf.get.dram_read_acquire_lock_duration_us",
4223
+ duration_ms=dram_read_acquire_lock_duration,
4224
+ enable_tb_metrics=True,
4225
+ time_unit="us",
4226
+ )
4227
+ stats_reporter.report_data_amount(
4228
+ iteration_step=self.step,
4229
+ event_name="dram_kv.perf.get.dram_read_missing_load",
4230
+ enable_tb_metrics=True,
4231
+ data_bytes=dram_read_missing_load,
4232
+ )
4233
+ stats_reporter.report_duration(
4234
+ iteration_step=self.step,
4235
+ event_name="dram_kv.perf.set.dram_write_sharing_duration_us",
4236
+ duration_ms=dram_write_sharing_duration,
4237
+ enable_tb_metrics=True,
4238
+ time_unit="us",
4239
+ )
4240
+
4241
+ stats_reporter.report_duration(
4242
+ iteration_step=self.step,
4243
+ event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_duration_us",
4244
+ duration_ms=dram_fwd_l1_eviction_write_duration,
4245
+ enable_tb_metrics=True,
4246
+ time_unit="us",
4247
+ )
4248
+ stats_reporter.report_duration(
4249
+ iteration_step=self.step,
4250
+ event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_allocate_duration_us",
4251
+ duration_ms=dram_fwd_l1_eviction_write_allocate_duration,
4252
+ enable_tb_metrics=True,
4253
+ time_unit="us",
4254
+ )
4255
+ stats_reporter.report_duration(
4256
+ iteration_step=self.step,
4257
+ event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_cache_copy_duration_us",
4258
+ duration_ms=dram_fwd_l1_eviction_write_cache_copy_duration,
4259
+ enable_tb_metrics=True,
4260
+ time_unit="us",
4261
+ )
4262
+ stats_reporter.report_duration(
4263
+ iteration_step=self.step,
4264
+ event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_lookup_cache_duration_us",
4265
+ duration_ms=dram_fwd_l1_eviction_write_lookup_cache_duration,
4266
+ enable_tb_metrics=True,
4267
+ time_unit="us",
4268
+ )
4269
+ stats_reporter.report_duration(
4270
+ iteration_step=self.step,
4271
+ event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_acquire_lock_duration_us",
4272
+ duration_ms=dram_fwd_l1_eviction_write_acquire_lock_duration,
4273
+ enable_tb_metrics=True,
4274
+ time_unit="us",
4275
+ )
4276
+ stats_reporter.report_data_amount(
4277
+ iteration_step=self.step,
4278
+ event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_missing_load",
4279
+ data_bytes=dram_fwd_l1_eviction_write_missing_load,
4280
+ enable_tb_metrics=True,
4281
+ )
4282
+
4283
+ stats_reporter.report_duration(
4284
+ iteration_step=self.step,
4285
+ event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_duration_us",
4286
+ duration_ms=dram_bwd_l1_cnflct_miss_write_duration,
4287
+ enable_tb_metrics=True,
4288
+ time_unit="us",
4289
+ )
4290
+ stats_reporter.report_duration(
4291
+ iteration_step=self.step,
4292
+ event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_allocate_duration_us",
4293
+ duration_ms=dram_bwd_l1_cnflct_miss_write_allocate_duration,
4294
+ enable_tb_metrics=True,
4295
+ time_unit="us",
4296
+ )
4297
+ stats_reporter.report_duration(
4298
+ iteration_step=self.step,
4299
+ event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_cache_copy_duration_us",
4300
+ duration_ms=dram_bwd_l1_cnflct_miss_write_cache_copy_duration,
4301
+ enable_tb_metrics=True,
4302
+ time_unit="us",
4303
+ )
4304
+ stats_reporter.report_duration(
4305
+ iteration_step=self.step,
4306
+ event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_lookup_cache_duration_us",
4307
+ duration_ms=dram_bwd_l1_cnflct_miss_write_lookup_cache_duration,
4308
+ enable_tb_metrics=True,
4309
+ time_unit="us",
4310
+ )
4311
+ stats_reporter.report_duration(
4312
+ iteration_step=self.step,
4313
+ event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_acquire_lock_duration_us",
4314
+ duration_ms=dram_bwd_l1_cnflct_miss_write_acquire_lock_duration,
4315
+ enable_tb_metrics=True,
4316
+ time_unit="us",
4317
+ )
4318
+ stats_reporter.report_data_amount(
4319
+ iteration_step=self.step,
4320
+ event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_missing_load",
4321
+ data_bytes=dram_bwd_l1_cnflct_miss_write_missing_load,
4322
+ enable_tb_metrics=True,
4323
+ )
4324
+
4325
+ stats_reporter.report_data_amount(
4326
+ iteration_step=self.step,
4327
+ event_name="dram_kv.perf.get.dram_kv_read_counts",
4328
+ data_bytes=dram_kv_read_counts,
4329
+ enable_tb_metrics=True,
4330
+ )
4331
+
4332
+ stats_reporter.report_data_amount(
4333
+ iteration_step=self.step,
4334
+ event_name=self.dram_kv_allocated_bytes_stats_name,
4335
+ data_bytes=dram_kv_allocated_bytes,
4336
+ enable_tb_metrics=True,
4337
+ )
4338
+ stats_reporter.report_data_amount(
4339
+ iteration_step=self.step,
4340
+ event_name=self.dram_kv_actual_used_chunk_bytes_stats_name,
4341
+ data_bytes=dram_kv_actual_used_chunk_bytes,
4342
+ enable_tb_metrics=True,
4343
+ )
4344
+ stats_reporter.report_data_amount(
4345
+ iteration_step=self.step,
4346
+ event_name=self.dram_kv_mem_num_rows_stats_name,
4347
+ data_bytes=dram_kv_num_rows,
4348
+ enable_tb_metrics=True,
4349
+ )
4350
+ stats_reporter.report_duration(
4351
+ iteration_step=self.step,
4352
+ event_name="dram_kv.perf.set.dram_eviction_score_write_sharding_total_duration_us",
4353
+ duration_ms=dram_metadata_write_sharding_total_duration,
4354
+ enable_tb_metrics=True,
4355
+ time_unit="us",
4356
+ )
4357
+ stats_reporter.report_duration(
4358
+ iteration_step=self.step,
4359
+ event_name="dram_kv.perf.set.dram_eviction_score_write_total_duration_us",
4360
+ duration_ms=dram_metadata_write_total_duration,
4361
+ enable_tb_metrics=True,
4362
+ time_unit="us",
4363
+ )
4364
+ stats_reporter.report_duration(
4365
+ iteration_step=self.step,
4366
+ event_name="dram_kv.perf.set.dram_eviction_score_write_allocate_avg_duration_us",
4367
+ duration_ms=dram_metadata_write_allocate_avg_duration,
4368
+ enable_tb_metrics=True,
4369
+ time_unit="us",
4370
+ )
4371
+ stats_reporter.report_duration(
4372
+ iteration_step=self.step,
4373
+ event_name="dram_kv.perf.set.dram_eviction_score_write_lookup_cache_avg_duration_us",
4374
+ duration_ms=dram_metadata_write_lookup_cache_avg_duration,
4375
+ enable_tb_metrics=True,
4376
+ time_unit="us",
4377
+ )
4378
+ stats_reporter.report_duration(
4379
+ iteration_step=self.step,
4380
+ event_name="dram_kv.perf.set.dram_eviction_score_write_acquire_lock_avg_duration_us",
4381
+ duration_ms=dram_metadata_write_acquire_lock_avg_duration,
4382
+ enable_tb_metrics=True,
4383
+ time_unit="us",
4384
+ )
4385
+ stats_reporter.report_data_amount(
4386
+ iteration_step=self.step,
4387
+ event_name="dram_kv.perf.set.dram_eviction_score_write_cache_miss_avg_count",
4388
+ data_bytes=dram_metadata_write_cache_miss_avg_count,
4389
+ enable_tb_metrics=True,
4390
+ )
4391
+ stats_reporter.report_duration(
4392
+ iteration_step=self.step,
4393
+ event_name="dram_kv.perf.get.dram_eviction_score_read_total_duration_us",
4394
+ duration_ms=dram_read_metadata_total_duration,
4395
+ enable_tb_metrics=True,
4396
+ time_unit="us",
4397
+ )
4398
+ stats_reporter.report_duration(
4399
+ iteration_step=self.step,
4400
+ event_name="dram_kv.perf.get.dram_eviction_score_read_sharding_total_duration_us",
4401
+ duration_ms=dram_read_metadata_sharding_total_duration,
4402
+ enable_tb_metrics=True,
4403
+ time_unit="us",
4404
+ )
4405
+ stats_reporter.report_duration(
4406
+ iteration_step=self.step,
4407
+ event_name="dram_kv.perf.get.dram_eviction_score_read_cache_hit_copy_avg_duration_us",
4408
+ duration_ms=dram_read_metadata_cache_hit_copy_avg_duration,
4409
+ enable_tb_metrics=True,
4410
+ time_unit="us",
4411
+ )
4412
+ stats_reporter.report_duration(
4413
+ iteration_step=self.step,
4414
+ event_name="dram_kv.perf.get.dram_eviction_score_read_lookup_cache_total_avg_duration_us",
4415
+ duration_ms=dram_read_metadata_lookup_cache_total_avg_duration,
4416
+ enable_tb_metrics=True,
4417
+ time_unit="us",
4418
+ )
4419
+ stats_reporter.report_duration(
4420
+ iteration_step=self.step,
4421
+ event_name="dram_kv.perf.get.dram_eviction_score_read_acquire_lock_avg_duration_us",
4422
+ duration_ms=dram_read_metadata_acquire_lock_avg_duration,
4423
+ enable_tb_metrics=True,
4424
+ time_unit="us",
4425
+ )
4426
+ stats_reporter.report_data_amount(
4427
+ iteration_step=self.step,
4428
+ event_name="dram_kv.perf.get.dram_eviction_score_read_load_size",
4429
+ data_bytes=dram_read_read_metadata_load_size,
4430
+ enable_tb_metrics=True,
4431
+ )
4432
+
2179
4433
  def _recording_to_timer(
2180
4434
  self, timer: Optional[AsyncSeriesTimer], **kwargs: Any
2181
4435
  ) -> Any:
@@ -2191,3 +4445,484 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2191
4445
  return timer.recording(**kwargs)
2192
4446
  # No-Op context manager
2193
4447
  return contextlib.nullcontext()
4448
+
4449
+ def fetch_from_l1_sp_w_row_ids(
4450
+ self, row_ids: torch.Tensor, only_get_optimizer_states: bool = False
4451
+ ) -> tuple[list[torch.Tensor], torch.Tensor]:
4452
+ """
4453
+ Fetch the optimizer states and/or weights from L1 and SP for given linearized row_ids.
4454
+ @return: updated_weights/optimizer_states, mask of which rows are filled
4455
+ """
4456
+ if not self.enable_optimizer_offloading and only_get_optimizer_states:
4457
+ raise RuntimeError(
4458
+ "Optimizer states are not offloaded, while only_get_optimizer_states is True"
4459
+ )
4460
+
4461
+ # NOTE: Remove this once there is support for fetching multiple
4462
+ # optimizer states in fetch_from_l1_sp_w_row_ids
4463
+ if only_get_optimizer_states and self.optimizer not in [
4464
+ OptimType.EXACT_ROWWISE_ADAGRAD,
4465
+ OptimType.PARTIAL_ROWWISE_ADAM,
4466
+ ]:
4467
+ raise RuntimeError(
4468
+ f"Fetching optimizer states using fetch_from_l1_sp_w_row_ids() is not yet supported for {self.optimizer}"
4469
+ )
4470
+
4471
+ def split_results_by_opt_states(
4472
+ updated_weights: torch.Tensor, cache_location_mask: torch.Tensor
4473
+ ) -> tuple[list[torch.Tensor], torch.Tensor]:
4474
+ if not only_get_optimizer_states:
4475
+ return [updated_weights], cache_location_mask
4476
+ # TODO: support mixed dimension case
4477
+ # currently only supports tables with the same max_D dimension
4478
+ opt_to_dim = self.optimizer.byte_offsets_along_row(
4479
+ self.max_D, self.weights_precision, self.optimizer_state_dtypes
4480
+ )
4481
+ updated_opt_states = []
4482
+ for opt_name, dim in opt_to_dim.items():
4483
+ opt_dtype = self.optimizer._extract_dtype(
4484
+ self.optimizer_state_dtypes, opt_name
4485
+ )
4486
+ updated_opt_states.append(
4487
+ updated_weights.view(dtype=torch.uint8)[:, dim[0] : dim[1]].view(
4488
+ dtype=opt_dtype
4489
+ )
4490
+ )
4491
+ return updated_opt_states, cache_location_mask
4492
+
4493
+ with torch.no_grad():
4494
+ weights_dtype = self.weights_precision.as_dtype()
4495
+ step = self.step
4496
+ with record_function(f"## fetch_from_l1_{step}_{self.tbe_unique_id} ##"):
4497
+ lxu_cache_locations: torch.Tensor = torch.ops.fbgemm.lxu_cache_lookup(
4498
+ row_ids,
4499
+ self.lxu_cache_state,
4500
+ self.total_hash_size,
4501
+ )
4502
+ updated_weights = torch.empty(
4503
+ row_ids.numel(),
4504
+ self.cache_row_dim,
4505
+ device=self.current_device,
4506
+ dtype=weights_dtype,
4507
+ )
4508
+
4509
+ # D2D copy cache
4510
+ cache_location_mask = lxu_cache_locations >= 0
4511
+ torch.ops.fbgemm.masked_index_select(
4512
+ updated_weights,
4513
+ lxu_cache_locations,
4514
+ self.lxu_cache_weights,
4515
+ torch.tensor(
4516
+ [row_ids.numel()],
4517
+ device=self.current_device,
4518
+ dtype=torch.int32,
4519
+ ),
4520
+ )
4521
+
4522
+ with record_function(f"## fetch_from_sp_{step}_{self.tbe_unique_id} ##"):
4523
+ if len(self.ssd_scratch_pad_eviction_data) > 0:
4524
+ sp = self.ssd_scratch_pad_eviction_data[0][0]
4525
+ sp_idx = self.ssd_scratch_pad_eviction_data[0][1].to(
4526
+ self.current_device
4527
+ )
4528
+ actions_count_gpu = self.ssd_scratch_pad_eviction_data[0][2][0]
4529
+ if actions_count_gpu.item() == 0:
4530
+ # no action to take
4531
+ return split_results_by_opt_states(
4532
+ updated_weights, cache_location_mask
4533
+ )
4534
+
4535
+ sp_idx = sp_idx[:actions_count_gpu]
4536
+
4537
+ # -1 in lxu_cache_locations means the row is not in the L1 cache and may instead be in the SP
4538
+ # fill the row_ids that are already in L1 with -2 so they cannot match; the remaining ids are then looked up in the SP
4539
+ # @eg. updated_row_ids_in_sp= [1, 100, 1, 2, -2, 3, 4, 5, 10]
4540
+ updated_row_ids_in_sp = row_ids.masked_fill(
4541
+ lxu_cache_locations != -1, -2
4542
+ )
4543
+ # sort the sp_idx for binary search
4544
+ # should already be sorted
4545
+ # sp_idx_inverse_indices are the pre-sort indices, which are the same as the locations in the SP.
4546
+ # @eg. sp_idx = [4, 2, 1, 3, 10]
4547
+ # @eg sorted_sp_idx = [ 1, 2, 3, 4, 10] and sp_idx_inverse_indices = [2, 1, 3, 0, 4]
4548
+ sorted_sp_idx, sp_idx_inverse_indices = torch.sort(sp_idx)
4549
+ # search the row ids against the SP indices to find the locations of the rows in the SP
4550
+ # @eg: updated_ids_in_sp_idx = [0, 5, 0, 1, 0, 2, 3, 4, 4]
4551
+ # @eg: 5 is OOB
4552
+ updated_ids_in_sp_idx = torch.searchsorted(
4553
+ sorted_sp_idx, updated_row_ids_in_sp
4554
+ )
4555
+ # ids not found in the SP end up out of bounds (OOB)
4556
+ oob_sp_idx = updated_ids_in_sp_idx >= sp_idx.numel()
4557
+ # clamp the OOB items back in bounds
4558
+ # @eg updated_ids_in_sp_idx=[0, 0, 0, 1, 0, 2, 3, 4, 4]
4559
+ updated_ids_in_sp_idx[oob_sp_idx] = 0
4560
+
4561
+ # -1 locations are filtered out by masked_index_select
4562
+ sp_locations_in_updated_weights = torch.full_like(
4563
+ updated_row_ids_in_sp, -1
4564
+ )
4565
+ # torch.searchsorted does not require an exact match,
4566
+ # so we only keep exactly matched rows, i.e. those whose id is found in the SP.
4567
+ # @eg 5 in updated_row_ids_in_sp is not in sp_idx, but has 4 in updated_ids_in_sp_idx
4568
+ # @eg sorted_sp_idx[updated_ids_in_sp_idx]=[ 1, 1, 1, 2, 1, 3, 4, 10, 10]
4569
+ # @eg exact_match_mask=[ True, False, True, True, False, True, True, False, True]
4570
+ exact_match_mask = (
4571
+ sorted_sp_idx[updated_ids_in_sp_idx] == updated_row_ids_in_sp
4572
+ )
4573
+ # Get the location of the row ids found in SP.
4574
+ # @eg: sp_locations_found=[2, 2, 1, 3, 0, 4]
4575
+ sp_locations_found = sp_idx_inverse_indices[
4576
+ updated_ids_in_sp_idx[exact_match_mask]
4577
+ ]
4578
+ # @eg: sp_locations_in_updated_weights=[ 2, -1, 2, 1, -1, 3, 0, -1, 4]
4579
+ sp_locations_in_updated_weights[exact_match_mask] = (
4580
+ sp_locations_found
4581
+ )
4582
+
4583
+ # D2D copy SP
4584
+ torch.ops.fbgemm.masked_index_select(
4585
+ updated_weights,
4586
+ sp_locations_in_updated_weights,
4587
+ sp,
4588
+ torch.tensor(
4589
+ [row_ids.numel()],
4590
+ device=self.current_device,
4591
+ dtype=torch.int32,
4592
+ ),
4593
+ )
4594
+ # cache_location_mask is the mask of rows in L1
4595
+ # exact_match_mask is the mask of rows in SP
4596
+ cache_location_mask = torch.logical_or(
4597
+ cache_location_mask, exact_match_mask
4598
+ )
4599
+
4600
+ return split_results_by_opt_states(updated_weights, cache_location_mask)
4601
+
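The scratch-pad lookup above leans on torch.searchsorted plus an exact-match mask, and the inline "@eg" comments carry a worked example. The snippet below reproduces exactly those example values in isolation so the masking logic can be sanity-checked on its own (plain torch only, no TBE state):

    import torch

    sp_idx = torch.tensor([4, 2, 1, 3, 10])
    row_ids_in_sp = torch.tensor([1, 100, 1, 2, -2, 3, 4, 5, 10])   # -2 marks rows already served by L1

    sorted_sp_idx, sp_idx_inverse_indices = torch.sort(sp_idx)      # [1, 2, 3, 4, 10], [2, 1, 3, 0, 4]
    pos = torch.searchsorted(sorted_sp_idx, row_ids_in_sp)          # [0, 5, 0, 1, 0, 2, 3, 4, 4]
    pos[pos >= sp_idx.numel()] = 0                                  # clamp out-of-bound positions
    exact_match_mask = sorted_sp_idx[pos] == row_ids_in_sp          # [T, F, T, T, F, T, T, F, T]

    sp_locations = torch.full_like(row_ids_in_sp, -1)               # -1 rows are skipped by masked_index_select
    sp_locations[exact_match_mask] = sp_idx_inverse_indices[pos[exact_match_mask]]
    print(sp_locations.tolist())                                    # [2, -1, 2, 1, -1, 3, 0, -1, 4]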
4602
+ def register_backward_hook_before_eviction(
4603
+ self, backward_hook: Callable[[torch.Tensor], None]
4604
+ ) -> None:
4605
+ """
4606
+ Register a backward hook on the TBE module.
4607
+ The hook is inserted first so that it runs before the SP eviction hook.
4608
+ """
4609
+ # make sure this hook is the first one to be executed
4610
+ hooks = []
4611
+ backward_hooks = self.placeholder_autograd_tensor._backward_hooks
4612
+ if backward_hooks is not None:
4613
+ for _handle_id, hook in backward_hooks.items():
4614
+ hooks.append(hook)
4615
+ backward_hooks.clear()
4616
+
4617
+ self.placeholder_autograd_tensor.register_hook(backward_hook)
4618
+ for hook in hooks:
4619
+ self.placeholder_autograd_tensor.register_hook(hook)
4620
+
4621
+ def set_local_weight_counts_for_table(
4622
+ self, table_idx: int, weight_count: int
4623
+ ) -> None:
4624
+ self.local_weight_counts[table_idx] = weight_count
4625
+
4626
+ def set_global_id_per_rank_for_table(
4627
+ self, table_idx: int, global_id: torch.Tensor
4628
+ ) -> None:
4629
+ self.global_id_per_rank[table_idx] = global_id
4630
+
4631
+ def direct_write_embedding(
4632
+ self,
4633
+ indices: torch.Tensor,
4634
+ offsets: torch.Tensor,
4635
+ weights: torch.Tensor,
4636
+ ) -> None:
4637
+ """
4638
+ Directly write the weights to L1, SP, and the backend without relying on autograd for the embedding cache.
4639
+ Please refer to design doc for more details: https://docs.google.com/document/d/1TJHKvO1m3-5tYAKZGhacXnGk7iCNAzz7wQlrFbX_LDI/edit?tab=t.0
4640
+ """
4641
+ assert (
4642
+ self._embedding_cache_mode
4643
+ ), "Must be in embedding_cache_mode to support direct_write_embedding method."
4644
+
4645
+ B_offsets = None
4646
+ max_B = -1
4647
+
4648
+ with torch.no_grad():
4649
+ # Wait for any ongoing prefetch operations to complete before starting direct_write
4650
+ current_stream = torch.cuda.current_stream()
4651
+ current_stream.wait_event(self.prefetch_complete_event)
4652
+
4653
+ # Create local step events for internal sequential execution
4654
+ weights_dtype = self.weights_precision.as_dtype()
4655
+ assert (
4656
+ weights_dtype == weights.dtype
4657
+ ), f"Expected embedding table dtype {weights_dtype} is same with input weight dtype, but got {weights.dtype}"
4658
+
4659
+ # Pad the weights to match self.cache_row_dim width if necessary
4660
+ if weights.size(1) < self.cache_row_dim:
4661
+ weights = torch.nn.functional.pad(
4662
+ weights, (0, self.cache_row_dim - weights.size(1))
4663
+ )
4664
+
4665
+ step = self.step
4666
+
4667
+ # step 0: if the prefetch pipeline is enabled, run the prefetch backward hook before writing to L1 and SP
4668
+ if self.prefetch_pipeline:
4669
+ self._update_cache_counter_and_pointers(nn.Module(), torch.empty(0))
4670
+
4671
+ # step 1: lookup and write to l1 cache
4672
+ with record_function(
4673
+ f"## direct_write_to_l1_{step}_{self.tbe_unique_id} ##"
4674
+ ):
4675
+ if self.gather_ssd_cache_stats:
4676
+ self.local_ssd_cache_stats.zero_()
4677
+
4678
+ # Linearize indices
4679
+ linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
4680
+ self.hash_size_cumsum,
4681
+ indices,
4682
+ offsets,
4683
+ B_offsets,
4684
+ max_B,
4685
+ )
4686
+
4687
+ lxu_cache_locations: torch.Tensor = torch.ops.fbgemm.lxu_cache_lookup(
4688
+ linear_cache_indices,
4689
+ self.lxu_cache_state,
4690
+ self.total_hash_size,
4691
+ )
4692
+ cache_location_mask = lxu_cache_locations >= 0
4693
+
4694
+ # Get the cache locations for the row_ids that are already in the cache
4695
+ cache_locations = lxu_cache_locations[cache_location_mask]
4696
+
4697
+ # Get the corresponding input weights for these row_ids
4698
+ cache_weights = weights[cache_location_mask]
4699
+
4700
+ # Update the cache with these input weights
4701
+ if cache_locations.numel() > 0:
4702
+ self.lxu_cache_weights.index_put_(
4703
+ (cache_locations,), cache_weights, accumulate=False
4704
+ )
4705
+
4706
+ # Record completion of step 1
4707
+ current_stream.record_event(self.direct_write_l1_complete_event)
4708
+
4709
+ # step 2: pop the current scratch pad and write to the next batch's scratch pad if it exists
4710
+ # Wait for step 1 to complete
4711
+ with record_function(
4712
+ f"## direct_write_to_sp_{step}_{self.tbe_unique_id} ##"
4713
+ ):
4714
+ if len(self.ssd_scratch_pad_eviction_data) > 0:
4715
+ self.ssd_scratch_pad_eviction_data.pop(0)
4716
+ if len(self.ssd_scratch_pad_eviction_data) > 0:
4717
+ # Wait for any pending backend reads to the next scratch pad
4718
+ # to complete before we write to it. Otherwise, stale backend data
4719
+ # will overwrite our direct_write updates.
4720
+ # The ssd_event_get marks completion of backend fetch operations.
4721
+ current_stream.wait_event(self.ssd_event_get)
4722
+
4723
+ # if scratch pad exists, write to next batch scratch pad
4724
+ sp = self.ssd_scratch_pad_eviction_data[0][0]
4725
+ sp_idx = self.ssd_scratch_pad_eviction_data[0][1].to(
4726
+ self.current_device
4727
+ )
4728
+ actions_count_gpu = self.ssd_scratch_pad_eviction_data[0][2][0]
4729
+ if actions_count_gpu.item() != 0:
4730
+ # when actions_count_gpu is zero, there is nothing to write to the SP
4731
+ sp_idx = sp_idx[:actions_count_gpu]
4732
+
4733
+ # -1 in lxu_cache_locations means the row is not in the L1 cache and may instead be in the SP or backend
4734
+ # fill the row_ids that are already in L1 with -2 so they cannot match; the remaining ids are then looked up in the SP or backend
4735
+ # @eg. updated_indices_in_sp= [1, 100, 1, 2, -2, 3, 4, 5, 10]
4736
+ updated_indices_in_sp = linear_cache_indices.masked_fill(
4737
+ lxu_cache_locations != -1, -2
4738
+ )
4739
+ # sort the sp_idx for binary search
4740
+ # should already be sorted
4741
+ # sp_idx_inverse_indices are the pre-sort indices, which are the same as the locations in the SP.
4742
+ # @eg. sp_idx = [4, 2, 1, 3, 10]
4743
+ # @eg sorted_sp_idx = [ 1, 2, 3, 4, 10] and sp_idx_inverse_indices = [2, 1, 3, 0, 4]
4744
+ sorted_sp_idx, sp_idx_inverse_indices = torch.sort(sp_idx)
4745
+ # search the row ids against the SP indices to find the locations of the rows in the SP
4746
+ # @eg: updated_indices_in_sp_idx = [0, 5, 0, 1, 0, 2, 3, 4, 4]
4747
+ # @eg: 5 is OOB
4748
+ updated_indices_in_sp_idx = torch.searchsorted(
4749
+ sorted_sp_idx, updated_indices_in_sp
4750
+ )
4751
+ # ids not found in the SP end up out of bounds (OOB)
4752
+ oob_sp_idx = updated_indices_in_sp_idx >= sp_idx.numel()
4753
+ # clamp the OOB items back in bounds
4754
+ # @eg updated_indices_in_sp_idx=[0, 0, 0, 1, 0, 2, 3, 4, 4]
4755
+ updated_indices_in_sp_idx[oob_sp_idx] = 0
4756
+
4757
+ # torch.searchsorted does not require an exact match,
4758
+ # so we only keep exactly matched rows, i.e. those whose id is found in the SP.
4759
+ # @eg 5 in updated_indices_in_sp is not in sp_idx, but has 4 in updated_indices_in_sp_idx
4760
+ # @eg sorted_sp_idx[updated_indices_in_sp_idx]=[ 1, 1, 1, 2, 1, 3, 4, 10, 10]
4761
+ # @eg exact_match_mask=[ True, False, True, True, False, True, True, False, True]
4762
+ exact_match_mask = (
4763
+ sorted_sp_idx[updated_indices_in_sp_idx]
4764
+ == updated_indices_in_sp
4765
+ )
4766
+ # Get the location of the row ids found in SP.
4767
+ # @eg: sp_locations_found=[2, 2, 1, 3, 0, 4]
4768
+ sp_locations_found = sp_idx_inverse_indices[
4769
+ updated_indices_in_sp_idx[exact_match_mask]
4770
+ ]
4771
+ # Get the corresponding weights for the matched indices
4772
+ matched_weights = weights[exact_match_mask]
4773
+
4774
+ # Write the weights to the scratch pad at the found locations
4775
+ if sp_locations_found.numel() > 0:
4776
+ sp.index_put_(
4777
+ (sp_locations_found,),
4778
+ matched_weights,
4779
+ accumulate=False,
4780
+ )
4781
+ current_stream.record_event(self.direct_write_sp_complete_event)
4782
+
4783
+ # step 3: write l1 cache missing rows to backend
4784
+ # Wait for step 2 to complete
4785
+ with record_function(
4786
+ f"## direct_write_to_backend_{step}_{self.tbe_unique_id} ##"
4787
+ ):
4788
+ # Use the existing ssd_eviction_stream for all backend write operations
4789
+ # This stream is already created with low priority during initialization
4790
+ with torch.cuda.stream(self.ssd_eviction_stream):
4791
+ # Create a mask for indices not in L1 cache
4792
+ non_cache_mask = ~cache_location_mask
4793
+
4794
+ # Calculate the count of valid indices (those not in L1 cache)
4795
+ valid_count = non_cache_mask.sum().to(torch.int64).cpu()
4796
+
4797
+ if valid_count.item() > 0:
4798
+ # Extract only the indices and weights that are not in L1 cache
4799
+ non_cache_indices = linear_cache_indices[non_cache_mask]
4800
+ non_cache_weights = weights[non_cache_mask]
4801
+
4802
+ # Move tensors to CPU for set_cuda
4803
+ cpu_indices = non_cache_indices.cpu()
4804
+ cpu_weights = non_cache_weights.cpu()
4805
+
4806
+ # Write to backend - only sending the non-cache indices and weights
4807
+ self.record_function_via_dummy_profile(
4808
+ f"## ssd_write_{step}_set_cuda_{self.tbe_unique_id} ##",
4809
+ self.ssd_db.set_cuda,
4810
+ cpu_indices,
4811
+ cpu_weights,
4812
+ valid_count,
4813
+ self.timestep,
4814
+ is_bwd=False,
4815
+ )
4816
+
4817
+ # Return control to the main stream without waiting for the backend operation to complete
4818
+
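Step 1 above splits incoming rows by whether lxu_cache_lookup found them in L1: hits are scattered in place with index_put_, misses continue to the scratch-pad and backend paths. A toy, self-contained illustration of that split (all tensor values here are made up, not taken from the module):

    import torch

    lxu_cache_locations = torch.tensor([3, -1, 0, -1])       # assumed lookup result; -1 = L1 miss
    weights = torch.arange(8, dtype=torch.float32).view(4, 2)
    l1_cache = torch.zeros(5, 2)                              # stand-in for lxu_cache_weights

    hit_mask = lxu_cache_locations >= 0
    # L1 hits are written directly into their cache slots.
    l1_cache.index_put_((lxu_cache_locations[hit_mask],), weights[hit_mask], accumulate=False)
    # L1 misses would go on to the scratch-pad / backend writes (steps 2 and 3 above).
    miss_weights = weights[~hit_mask]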
4819
+ def get_free_cpu_memory_gb(self) -> float:
4820
+ def _get_mem_available() -> float:
4821
+ if sys.platform.startswith("linux"):
4822
+ info = {}
4823
+ with open("/proc/meminfo") as f:
4824
+ for line in f:
4825
+ p = line.split()
4826
+ info[p[0].strip(":").lower()] = int(p[1]) * 1024
4827
+ if "memavailable" in info:
4828
+ # Linux >= 3.14
4829
+ return info["memavailable"]
4830
+ else:
4831
+ return info["memfree"] + info["cached"]
4832
+ else:
4833
+ raise RuntimeError(
4834
+ "Unsupported platform for free memory eviction, pls use ID count eviction tirgger mode"
4835
+ )
4836
+
4837
+ mem = _get_mem_available()
4838
+ return mem / (1024**3)
4839
+
4840
+ @classmethod
4841
+ def trigger_evict_in_all_tbes(cls) -> None:
4842
+ for tbe in cls._all_tbe_instances:
4843
+ tbe.ssd_db.trigger_feature_evict()
4844
+
4845
+ @classmethod
4846
+ def tbe_has_ongoing_eviction(cls) -> bool:
4847
+ for tbe in cls._all_tbe_instances:
4848
+ if tbe.ssd_db.is_evicting():
4849
+ return True
4850
+ return False
4851
+
4852
+ def set_free_mem_eviction_trigger_config(
4853
+ self, eviction_policy: EvictionPolicy
4854
+ ) -> None:
4855
+ self.enable_free_mem_trigger_eviction = True
4856
+ self.eviction_trigger_mode: int = eviction_policy.eviction_trigger_mode
4857
+ assert (
4858
+ eviction_policy.eviction_free_mem_check_interval_batch is not None
4859
+ ), "eviction_free_mem_check_interval_batch is unexpected none for free_mem eviction trigger mode"
4860
+ self.eviction_free_mem_check_interval_batch: int = (
4861
+ eviction_policy.eviction_free_mem_check_interval_batch
4862
+ )
4863
+ assert (
4864
+ eviction_policy.eviction_free_mem_threshold_gb is not None
4865
+ ), "eviction_policy.eviction_free_mem_threshold_gb is unexpected none for free_mem eviction trigger mode"
4866
+ self.eviction_free_mem_threshold_gb: int = (
4867
+ eviction_policy.eviction_free_mem_threshold_gb
4868
+ )
4869
+ logging.info(
4870
+ f"[FREE_MEM Eviction] eviction config, trigger model: FREE_MEM, {self.eviction_free_mem_check_interval_batch=}, {self.eviction_free_mem_threshold_gb=}"
4871
+ )
4872
+
4873
+ def may_trigger_eviction(self) -> None:
4874
+ def is_first_tbe() -> bool:
4875
+ first = SSDTableBatchedEmbeddingBags._first_instance_ref
4876
+ return first is not None and first() is self
4877
+
4878
+ # We assume that the eviction time is shorter than the free-mem check interval,
4879
+ # so every time we reach this check, all evictions in all TBEs should have finished.
4880
+ # We only need to check the first TBE because all TBEs share the same free memory;
4881
+ # once the first TBE detects that eviction is needed, it calls the trigger function
4882
+ # on all TBEs in _all_tbe_instances
4883
+ if (
4884
+ self.enable_free_mem_trigger_eviction
4885
+ and self.step % self.eviction_free_mem_check_interval_batch == 0
4886
+ and self.training
4887
+ and is_first_tbe()
4888
+ ):
4889
+ if not SSDTableBatchedEmbeddingBags.tbe_has_ongoing_eviction():
4890
+ SSDTableBatchedEmbeddingBags._eviction_triggered = False
4891
+
4892
+ free_cpu_mem_gb = self.get_free_cpu_memory_gb()
4893
+ local_evict_trigger = int(
4894
+ free_cpu_mem_gb < self.eviction_free_mem_threshold_gb
4895
+ )
4896
+ tensor_flag = torch.tensor(
4897
+ local_evict_trigger,
4898
+ device=self.current_device,
4899
+ dtype=torch.int,
4900
+ )
4901
+ world_size = dist.get_world_size(self._pg)
4902
+ if world_size > 1:
4903
+ dist.all_reduce(tensor_flag, op=dist.ReduceOp.SUM, group=self._pg)
4904
+ global_evict_trigger = tensor_flag.item()
4905
+ else:
4906
+ global_evict_trigger = local_evict_trigger
4907
+ if (
4908
+ global_evict_trigger >= 1
4909
+ and SSDTableBatchedEmbeddingBags._eviction_triggered
4910
+ ):
4911
+ logging.warning(
4912
+ f"[FREE_MEM Eviction] {global_evict_trigger} ranks triggered eviction, but SSDTableBatchedEmbeddingBags._eviction_triggered is true"
4913
+ )
4914
+ if (
4915
+ global_evict_trigger >= 1
4916
+ and not SSDTableBatchedEmbeddingBags._eviction_triggered
4917
+ ):
4918
+ SSDTableBatchedEmbeddingBags._eviction_triggered = True
4919
+ SSDTableBatchedEmbeddingBags.trigger_evict_in_all_tbes()
4920
+ logging.info(
4921
+ f"[FREE_MEM Eviction] Evict all at batch {self.step}, {free_cpu_mem_gb} GB free CPU memory, {global_evict_trigger} ranks triggered eviction"
4922
+ )
4923
+
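may_trigger_eviction reaches cross-rank agreement by summing a per-rank low-memory vote with an all_reduce: if any rank crossed its threshold, the sum is nonzero on every rank, so all TBEs evict together. A minimal sketch of just that agreement step (a generic helper, not this module's API; assumes the default process group is already initialized whenever more than one rank is involved):

    import torch
    import torch.distributed as dist

    def any_rank_wants_eviction(local_low_memory: bool, device: torch.device, pg=None) -> bool:
        # Each rank votes 0 or 1; a SUM all_reduce >= 1 means at least one rank
        # crossed its free-memory threshold, so every rank triggers eviction.
        flag = torch.tensor(int(local_low_memory), device=device, dtype=torch.int)
        if dist.is_initialized() and dist.get_world_size(pg) > 1:
            dist.all_reduce(flag, op=dist.ReduceOp.SUM, group=pg)
        return bool(flag.item() >= 1)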
4924
+ def reset_inference_mode(self) -> None:
4925
+ """
4926
+ Reset the inference mode
4927
+ """
4928
+ self.eval()