fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
@@ -14,13 +14,13 @@ import itertools
14
14
  import logging
15
15
  import math
16
16
  import os
17
- import tempfile
18
17
  import threading
19
18
  import time
20
19
  from functools import cached_property
21
- from math import ceil, floor, log2
22
- from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
20
+ from math import floor, log2
21
+ from typing import Any, Callable, ClassVar, Optional, Union
23
22
  import torch # usort:skip
23
+ import weakref
24
24
 
25
25
  # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
26
26
  import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
@@ -35,6 +35,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
35
35
  BoundsCheckMode,
36
36
  CacheAlgorithm,
37
37
  EmbeddingLocation,
38
+ EvictionPolicy,
38
39
  get_bounds_check_version_for_platform,
39
40
  KVZCHParams,
40
41
  PoolingMode,
@@ -49,10 +50,12 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
49
50
  WeightDecayMode,
50
51
  )
51
52
  from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
53
+ check_allocated_vbe_output,
52
54
  generate_vbe_metadata,
53
55
  is_torchdynamo_compiling,
54
56
  )
55
57
  from torch import distributed as dist, nn, Tensor # usort:skip
58
+ import sys
56
59
  from dataclasses import dataclass
57
60
 
58
61
  from torch.autograd.profiler import record_function
@@ -76,10 +79,10 @@ class IterData:
76
79
 
77
80
  @dataclass
78
81
  class KVZCHCachedData:
79
- cached_optimizer_states_per_table: List[List[torch.Tensor]]
80
- cached_weight_tensor_per_table: List[torch.Tensor]
81
- cached_id_tensor_per_table: List[torch.Tensor]
82
- cached_bucket_splits: List[torch.Tensor]
82
+ cached_optimizer_states_per_table: list[list[torch.Tensor]]
83
+ cached_weight_tensor_per_table: list[torch.Tensor]
84
+ cached_id_tensor_per_table: list[torch.Tensor]
85
+ cached_bucket_splits: list[torch.Tensor]
83
86
 
84
87
 
85
88
  class SSDTableBatchedEmbeddingBags(nn.Module):
@@ -100,13 +103,18 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
100
103
  weights_offsets: Tensor
101
104
  _local_instance_index: int = -1
102
105
  res_params: RESParams
103
- table_names: List[str]
106
+ table_names: list[str]
107
+ _all_tbe_instances: ClassVar[weakref.WeakSet] = weakref.WeakSet()
108
+ _first_instance_ref: ClassVar[weakref.ref] = None
109
+ _eviction_triggered: ClassVar[bool] = False
104
110
 
105
111
  def __init__(
106
112
  self,
107
- embedding_specs: List[Tuple[int, int]], # tuple of (rows, dims)
108
- feature_table_map: Optional[List[int]], # [T]
113
+ embedding_specs: list[tuple[int, int]], # tuple of (rows, dims)
114
+ feature_table_map: Optional[list[int]], # [T]
109
115
  cache_sets: int,
116
+ # A comma-separated string, e.g. "/data00_nvidia0,/data01_nvidia0/", db shards
117
+ # will be placed in these paths round-robin.
110
118
  ssd_storage_directory: str,
111
119
  ssd_rocksdb_shards: int = 1,
112
120
  ssd_memtable_flush_period: int = -1,
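The new comment on `ssd_storage_directory` above describes it as a comma-separated list of paths, with the db shards placed across those paths round-robin. The actual placement is done by the backend; the snippet below is only a minimal sketch of that mapping, with `shard_paths` being a hypothetical helper rather than part of this package.

    def shard_paths(ssd_storage_directory: str, ssd_rocksdb_shards: int) -> list[str]:
        # Split the comma-separated directory string and assign shards round-robin
        dirs = [d for d in ssd_storage_directory.split(",") if d]
        return [dirs[shard % len(dirs)] for shard in range(ssd_rocksdb_shards)]

    # shard_paths("/data00_nvidia0,/data01_nvidia0", 4)
    # -> ["/data00_nvidia0", "/data01_nvidia0", "/data00_nvidia0", "/data01_nvidia0"]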
@@ -146,13 +154,16 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
146
154
  pooling_mode: PoolingMode = PoolingMode.SUM,
147
155
  bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING,
148
156
  # Parameter Server Configs
149
- ps_hosts: Optional[Tuple[Tuple[str, int]]] = None,
157
+ ps_hosts: Optional[tuple[tuple[str, int]]] = None,
150
158
  ps_max_key_per_request: Optional[int] = None,
151
159
  ps_client_thread_num: Optional[int] = None,
152
160
  ps_max_local_index_length: Optional[int] = None,
153
161
  tbe_unique_id: int = -1,
154
- # in local test we need to use the pass in path for rocksdb creation
155
- # in production we need to do it inside SSD mount path which will ignores the passed in path
162
+ # If set to True, will use `ssd_storage_directory` as the ssd paths.
163
+ # If set to False, will use the default ssd paths.
164
+ # In local test we need to use the passed-in path for rocksdb creation
165
+ # In production we could either use the default ssd mount points or explicitly specify ssd
166
+ # mount points using `ssd_storage_directory`.
156
167
  use_passed_in_path: int = True,
157
168
  gather_ssd_cache_stats: Optional[bool] = False,
158
169
  stats_reporter_config: Optional[TBEStatsReporterConfig] = None,
@@ -172,14 +183,18 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
172
183
  enable_raw_embedding_streaming: bool = False, # whether enable raw embedding streaming
173
184
  res_params: Optional[RESParams] = None, # raw embedding streaming sharding info
174
185
  flushing_block_size: int = 2_000_000_000, # 2GB
175
- table_names: Optional[List[str]] = None,
176
- optimizer_state_dtypes: Dict[str, SparseType] = {}, # noqa: B006
186
+ table_names: Optional[list[str]] = None,
187
+ use_rowwise_bias_correction: bool = False, # For Adam use
188
+ optimizer_state_dtypes: dict[str, SparseType] = {}, # noqa: B006
189
+ pg: Optional[dist.ProcessGroup] = None,
177
190
  ) -> None:
178
191
  super(SSDTableBatchedEmbeddingBags, self).__init__()
179
192
 
180
193
  # Set the optimizer
181
194
  assert optimizer in (
182
195
  OptimType.EXACT_ROWWISE_ADAGRAD,
196
+ OptimType.PARTIAL_ROWWISE_ADAM,
197
+ OptimType.ADAM,
183
198
  ), f"Optimizer {optimizer} is not supported by SSDTableBatchedEmbeddingBags"
184
199
  self.optimizer = optimizer
185
200
 
@@ -187,15 +202,28 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
187
202
  assert weights_precision in (SparseType.FP32, SparseType.FP16)
188
203
  self.weights_precision = weights_precision
189
204
  self.output_dtype: int = output_dtype.as_int()
190
- self.optimizer_state_dtypes: Dict[str, SparseType] = optimizer_state_dtypes
205
+
206
+ if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
207
+ # Adagrad currently only supports FP32 for momentum1
208
+ self.optimizer_state_dtypes: dict[str, SparseType] = {
209
+ "momentum1": SparseType.FP32,
210
+ }
211
+ else:
212
+ self.optimizer_state_dtypes: dict[str, SparseType] = optimizer_state_dtypes
191
213
 
192
214
  # Zero collision TBE configurations
193
215
  self.kv_zch_params = kv_zch_params
194
216
  self.backend_type = backend_type
195
217
  self.enable_optimizer_offloading: bool = False
196
218
  self.backend_return_whole_row: bool = False
219
+ self._embedding_cache_mode: bool = False
220
+ self.load_ckpt_without_opt: bool = False
197
221
  if self.kv_zch_params:
198
222
  self.kv_zch_params.validate()
223
+ self.load_ckpt_without_opt = (
224
+ # pyre-ignore [16]
225
+ self.kv_zch_params.load_ckpt_without_opt
226
+ )
199
227
  self.enable_optimizer_offloading = (
200
228
  # pyre-ignore [16]
201
229
  self.kv_zch_params.enable_optimizer_offloading
@@ -214,12 +242,43 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
214
242
  logging.info(
215
243
  "Backend will return whole row including metaheader, weight and optimizer for checkpoint"
216
244
  )
245
+ # pyre-ignore [16]
246
+ self._embedding_cache_mode = self.kv_zch_params.embedding_cache_mode
247
+ if self._embedding_cache_mode:
248
+ logging.info("KVZCH is in embedding_cache_mode")
249
+ assert self.optimizer in [
250
+ OptimType.EXACT_ROWWISE_ADAGRAD
251
+ ], f"only EXACT_ROWWISE_ADAGRAD supports embedding cache mode, but got {self.optimizer}"
252
+ if self.load_ckpt_without_opt:
253
+ if (
254
+ # pyre-ignore [16]
255
+ self.kv_zch_params.optimizer_type_for_st
256
+ == OptimType.PARTIAL_ROWWISE_ADAM.value
257
+ ):
258
+ self.optimizer = OptimType.PARTIAL_ROWWISE_ADAM
259
+ logging.info(
260
+ f"Override optimizer type with {self.optimizer=} for st publish"
261
+ )
262
+ if (
263
+ # pyre-ignore [16]
264
+ self.kv_zch_params.optimizer_state_dtypes_for_st
265
+ is not None
266
+ ):
267
+ optimizer_state_dtypes = {}
268
+ for k, v in dict(
269
+ self.kv_zch_params.optimizer_state_dtypes_for_st
270
+ ).items():
271
+ optimizer_state_dtypes[k] = SparseType.from_int(v)
272
+ self.optimizer_state_dtypes = optimizer_state_dtypes
273
+ logging.info(
274
+ f"Override optimizer_state_dtypes with {self.optimizer_state_dtypes=} for st publish"
275
+ )
217
276
 
218
277
  self.pooling_mode = pooling_mode
219
278
  self.bounds_check_mode_int: int = bounds_check_mode.value
220
279
  self.embedding_specs = embedding_specs
221
280
  self.table_names = table_names if table_names is not None else []
222
- (rows, dims) = zip(*embedding_specs)
281
+ rows, dims = zip(*embedding_specs)
223
282
  T_ = len(self.embedding_specs)
224
283
  assert T_ > 0
225
284
  # pyre-fixme[8]: Attribute has type `device`; used as `int`.
@@ -238,7 +297,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
238
297
  f"get env {self.res_params.res_server_port=}, at rank {dist.get_rank()}, with {self.res_params=}"
239
298
  )
240
299
 
241
- self.feature_table_map: List[int] = (
300
+ self.feature_table_map: list[int] = (
242
301
  feature_table_map if feature_table_map is not None else list(range(T_))
243
302
  )
244
303
  T = len(self.feature_table_map)
@@ -318,7 +377,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
318
377
  torch.tensor(dims, device="cpu", dtype=torch.int64),
319
378
  )
320
379
 
321
- (info_B_num_bits_, info_B_mask_) = torch.ops.fbgemm.get_infos_metadata(
380
+ info_B_num_bits_, info_B_mask_ = torch.ops.fbgemm.get_infos_metadata(
322
381
  self.D_offsets, # unused tensor
323
382
  1, # max_B
324
383
  T, # T
@@ -514,11 +573,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
514
573
  self.record_function_via_dummy_profile_factory(use_dummy_profile)
515
574
  )
516
575
 
517
- os.makedirs(ssd_storage_directory, exist_ok=True)
576
+ if use_passed_in_path:
577
+ ssd_dir_list = ssd_storage_directory.split(",")
578
+ for ssd_dir in ssd_dir_list:
579
+ os.makedirs(ssd_dir, exist_ok=True)
518
580
 
519
- ssd_directory = tempfile.mkdtemp(
520
- prefix="ssd_table_batched_embeddings", dir=ssd_storage_directory
521
- )
581
+ ssd_directory = ssd_storage_directory
522
582
  # logging.info("DEBUG: weights_precision {}".format(weights_precision))
523
583
 
524
584
  """
@@ -538,10 +598,16 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
538
598
  """
539
599
  self._cached_kvzch_data: Optional[KVZCHCachedData] = None
540
600
  # initial embedding rows on this rank per table, this is used for loading checkpoint
541
- self.local_weight_counts: List[int] = [0] * T_
601
+ self.local_weight_counts: list[int] = [0] * T_
602
+ # ground-truth global ids on this rank per table; this is used for loading checkpoints
603
+ self.global_id_per_rank: list[torch.Tensor] = [torch.zeros(0)] * T_
542
604
  # loading checkpoint flag, set by checkpoint loader, and cleared after weight is applied to backend
543
605
  self.load_state_dict: bool = False
544
606
 
607
+ SSDTableBatchedEmbeddingBags._all_tbe_instances.add(self)
608
+ if SSDTableBatchedEmbeddingBags._first_instance_ref is None:
609
+ SSDTableBatchedEmbeddingBags._first_instance_ref = weakref.ref(self)
610
+
545
611
  # create tbe unique id using rank index | local tbe idx
546
612
  if tbe_unique_id == -1:
547
613
  SSDTableBatchedEmbeddingBags._local_instance_index += 1
@@ -559,6 +625,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
559
625
  self.tbe_unique_id = tbe_unique_id
560
626
  self.l2_cache_size = l2_cache_size
561
627
  logging.info(f"tbe_unique_id: {tbe_unique_id}")
628
+ self.enable_free_mem_trigger_eviction: bool = False
562
629
  if self.backend_type == BackendType.SSD:
563
630
  logging.info(
564
631
  f"Logging SSD offloading setup, tbe_unique_id:{tbe_unique_id}, l2_cache_size:{l2_cache_size}GB, "
@@ -614,6 +681,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
614
681
  else None
615
682
  ),
616
683
  flushing_block_size,
684
+ self._embedding_cache_mode, # disable_random_init
617
685
  )
618
686
  if self.bulk_init_chunk_size > 0:
619
687
  self.ssd_uniform_init_lower: float = ssd_uniform_init_lower
@@ -662,18 +730,41 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
662
730
  if self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
663
731
  else self.l2_cache_size
664
732
  )
733
+ kv_zch_params = self.kv_zch_params
734
+ eviction_policy = self.kv_zch_params.eviction_policy
735
+ if eviction_policy.eviction_trigger_mode == 5:
736
+ # If trigger mode is free_mem(5), populate config
737
+ self.set_free_mem_eviction_trigger_config(eviction_policy)
738
+
739
+ enable_eviction_for_feature_score_eviction_policy = ( # pytorch api in c++ doesn't support vector<bool>, convert to int here, 0: no eviction, 1: eviction
740
+ [
741
+ int(x)
742
+ for x in eviction_policy.enable_eviction_for_feature_score_eviction_policy
743
+ ]
744
+ if eviction_policy.enable_eviction_for_feature_score_eviction_policy
745
+ is not None
746
+ else None
747
+ )
748
+ # Please refer to https://fburl.com/gdoc/nuupjwqq for the following eviction parameters.
665
749
  eviction_config = torch.classes.fbgemm.FeatureEvictConfig(
666
- self.kv_zch_params.eviction_policy.eviction_trigger_mode, # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
667
- self.kv_zch_params.eviction_policy.eviction_strategy, # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
668
- self.kv_zch_params.eviction_policy.eviction_step_intervals, # trigger_step_interval if trigger mode is iteration
750
+ eviction_policy.eviction_trigger_mode, # eviction trigger mode, 0: disabled, 1: iteration, 2: mem_util, 3: manual, 4: id count, 5: free_mem
751
+ eviction_policy.eviction_strategy, # evict_trigger_strategy: 0: timestamp, 1: counter, 2: counter + timestamp, 3: feature l2 norm, 4: timestamp threshold, 5: feature score
752
+ eviction_policy.eviction_step_intervals, # trigger_step_interval if trigger mode is iteration
669
753
  eviction_mem_threshold_gb, # mem_util_threshold_in_GB if trigger mode is mem_util
670
- self.kv_zch_params.eviction_policy.ttls_in_mins, # ttls_in_mins for each table if eviction strategy is timestamp
671
- self.kv_zch_params.eviction_policy.counter_thresholds, # counter_thresholds for each table if eviction strategy is feature score
672
- self.kv_zch_params.eviction_policy.counter_decay_rates, # counter_decay_rates for each table if eviction strategy is feature score
673
- self.kv_zch_params.eviction_policy.l2_weight_thresholds, # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
754
+ eviction_policy.ttls_in_mins, # ttls_in_mins for each table if eviction strategy is timestamp
755
+ eviction_policy.counter_thresholds, # counter_thresholds for each table if eviction strategy is counter
756
+ eviction_policy.counter_decay_rates, # counter_decay_rates for each table if eviction strategy is counter
757
+ eviction_policy.feature_score_counter_decay_rates, # feature_score_counter_decay_rates for each table if eviction strategy is feature score
758
+ eviction_policy.training_id_eviction_trigger_count, # training_id_eviction_trigger_count for each table
759
+ eviction_policy.training_id_keep_count, # training_id_keep_count for each table
760
+ enable_eviction_for_feature_score_eviction_policy, # no eviction setting for feature score eviction policy
761
+ eviction_policy.l2_weight_thresholds, # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
674
762
  table_dims.tolist() if table_dims is not None else None,
675
- self.kv_zch_params.eviction_policy.interval_for_insufficient_eviction_s,
676
- self.kv_zch_params.eviction_policy.interval_for_sufficient_eviction_s,
763
+ eviction_policy.threshold_calculation_bucket_stride, # threshold_calculation_bucket_stride if eviction strategy is feature score
764
+ eviction_policy.threshold_calculation_bucket_num, # threshold_calculation_bucket_num if eviction strategy is feature score
765
+ eviction_policy.interval_for_insufficient_eviction_s,
766
+ eviction_policy.interval_for_sufficient_eviction_s,
767
+ eviction_policy.interval_for_feature_statistics_decay_s,
677
768
  )
678
769
  self._ssd_db = torch.classes.fbgemm.DramKVEmbeddingCacheWrapper(
679
770
  self.cache_row_dim,
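The `FeatureEvictConfig` arguments above are plain integers whose meanings are given only in the inline comments. Purely as a reading aid (these enums are not an API of the package), the codes spelled out in those comments can be written as:

    from enum import IntEnum

    class EvictionTriggerMode(IntEnum):  # per the inline comments above
        DISABLED = 0
        ITERATION = 1
        MEM_UTIL = 2
        MANUAL = 3
        ID_COUNT = 4
        FREE_MEM = 5

    class EvictionStrategy(IntEnum):  # per the inline comments above
        TIMESTAMP = 0
        COUNTER = 1
        COUNTER_AND_TIMESTAMP = 2
        FEATURE_L2_NORM = 3
        TIMESTAMP_THRESHOLD = 4
        FEATURE_SCORE = 5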
@@ -690,16 +781,20 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
690
781
  else None
691
782
  ), # hash_size_cumsum
692
783
  self.backend_return_whole_row, # backend_return_whole_row
784
+ False, # enable_async_update
785
+ self._embedding_cache_mode, # disable_random_init
693
786
  )
694
787
  else:
695
788
  raise AssertionError(f"Invalid backend type {self.backend_type}")
696
789
 
697
790
  # pyre-fixme[20]: Argument `self` expected.
698
- (low_priority, high_priority) = torch.cuda.Stream.priority_range()
791
+ low_priority, high_priority = torch.cuda.Stream.priority_range()
699
792
  # GPU stream for SSD cache eviction
700
793
  self.ssd_eviction_stream = torch.cuda.Stream(priority=low_priority)
701
- # GPU stream for SSD memory copy
794
+ # GPU stream for SSD memory copy (also reused for feature score D2H)
702
795
  self.ssd_memcpy_stream = torch.cuda.Stream(priority=low_priority)
796
+ # GPU stream for async metadata operation
797
+ self.feature_score_stream = torch.cuda.Stream(priority=low_priority)
703
798
 
704
799
  # SSD get completion event
705
800
  self.ssd_event_get = torch.cuda.Event()
@@ -711,6 +806,17 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
711
806
  self.ssd_event_backward = torch.cuda.Event()
712
807
  # SSD get's input copy completion event
713
808
  self.ssd_event_get_inputs_cpy = torch.cuda.Event()
809
+ if self._embedding_cache_mode:
810
+ # Direct write embedding completion event
811
+ self.direct_write_l1_complete_event: torch.cuda.streams.Event = (
812
+ torch.cuda.Event()
813
+ )
814
+ self.direct_write_sp_complete_event: torch.cuda.streams.Event = (
815
+ torch.cuda.Event()
816
+ )
817
+ # Prefetch operation completion event
818
+ self.prefetch_complete_event = torch.cuda.Event()
819
+
714
820
  if self.prefetch_pipeline:
715
821
  # SSD scratch pad index queue insert completion event
716
822
  self.ssd_event_sp_idxq_insert: torch.cuda.streams.Event = torch.cuda.Event()
@@ -771,22 +877,22 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
771
877
  )
772
878
 
773
879
  # (Indices, Count)
774
- self.prefetched_info: List[Tuple[Tensor, Tensor]] = []
880
+ self.prefetched_info: list[tuple[Tensor, Tensor]] = []
775
881
 
776
- self.timesteps_prefetched: List[int] = []
882
+ self.timesteps_prefetched: list[int] = []
777
883
  # TODO: add type annotation
778
884
  # pyre-fixme[4]: Attribute must be annotated.
779
885
  self.ssd_prefetch_data = []
780
886
 
781
887
  # Scratch pad eviction data queue
782
- self.ssd_scratch_pad_eviction_data: List[
783
- Tuple[Tensor, Tensor, Tensor, bool]
888
+ self.ssd_scratch_pad_eviction_data: list[
889
+ tuple[Tensor, Tensor, Tensor, bool]
784
890
  ] = []
785
- self.ssd_location_update_data: List[Tuple[Tensor, Tensor]] = []
891
+ self.ssd_location_update_data: list[tuple[Tensor, Tensor]] = []
786
892
 
787
893
  if self.prefetch_pipeline:
788
894
  # Scratch pad value queue
789
- self.ssd_scratch_pads: List[Tuple[Tensor, Tensor, Tensor]] = []
895
+ self.ssd_scratch_pads: list[tuple[Tensor, Tensor, Tensor]] = []
790
896
 
791
897
  # pyre-ignore[4]
792
898
  # Scratch pad index queue
@@ -835,7 +941,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
835
941
  weight_norm_coefficient=cowclip_regularization.weight_norm_coefficient,
836
942
  lower_bound=cowclip_regularization.lower_bound,
837
943
  regularization_mode=weight_decay_mode.value,
838
- use_rowwise_bias_correction=False, # Unused, this is used in TBE's Adam
944
+ use_rowwise_bias_correction=use_rowwise_bias_correction, # Used in Adam optimizer
839
945
  )
840
946
 
841
947
  table_embedding_dtype = weights_precision.as_dtype()
@@ -888,7 +994,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
888
994
  self.ssd_cache_stats_size = 6
889
995
  # 0: N_calls, 1: N_requested_indices, 2: N_unique_indices, 3: N_unique_misses,
890
996
  # 4: N_conflict_unique_misses, 5: N_conflict_misses
891
- self.last_reported_ssd_stats: List[float] = []
997
+ self.last_reported_ssd_stats: list[float] = []
892
998
  self.last_reported_step = 0
893
999
 
894
1000
  self.register_buffer(
@@ -919,7 +1025,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
919
1025
  self.prefetch_parallel_stream_cnt: int = 2
920
1026
  # tuple of iteration, prefetch parallel stream cnt, reported duration
921
1027
  # since there are 2 stream in parallel in prefetch, we want to count the longest one
922
- self.prefetch_duration_us: Tuple[int, int, float] = (
1028
+ self.prefetch_duration_us: tuple[int, int, float] = (
923
1029
  -1,
924
1030
  self.prefetch_parallel_stream_cnt,
925
1031
  0,
@@ -945,6 +1051,20 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
945
1051
  self.dram_kv_allocated_bytes_stats_name: str = (
946
1052
  f"dram_kv.mem.tbe_id{tbe_unique_id}.allocated_bytes"
947
1053
  )
1054
+ self.dram_kv_mem_num_rows_stats_name: str = (
1055
+ f"dram_kv.mem.tbe_id{tbe_unique_id}.num_rows"
1056
+ )
1057
+
1058
+ self.eviction_sum_evicted_counts_stats_name: str = (
1059
+ f"eviction.tbe_id.{tbe_unique_id}.sum_evicted_counts"
1060
+ )
1061
+ self.eviction_sum_processed_counts_stats_name: str = (
1062
+ f"eviction.tbe_id.{tbe_unique_id}.sum_processed_counts"
1063
+ )
1064
+ self.eviction_evict_rate_stats_name: str = (
1065
+ f"eviction.tbe_id.{tbe_unique_id}.evict_rate"
1066
+ )
1067
+
948
1068
  if self.stats_reporter:
949
1069
  self.ssd_prefetch_read_timer = AsyncSeriesTimer(
950
1070
  functools.partial(
@@ -972,9 +1092,41 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
972
1092
  self.stats_reporter.register_stats(
973
1093
  self.dram_kv_actual_used_chunk_bytes_stats_name
974
1094
  )
1095
+ self.stats_reporter.register_stats(self.dram_kv_mem_num_rows_stats_name)
1096
+ self.stats_reporter.register_stats(
1097
+ self.eviction_sum_evicted_counts_stats_name
1098
+ )
1099
+ self.stats_reporter.register_stats(
1100
+ self.eviction_sum_processed_counts_stats_name
1101
+ )
1102
+ self.stats_reporter.register_stats(self.eviction_evict_rate_stats_name)
1103
+ for t in self.feature_table_map:
1104
+ self.stats_reporter.register_stats(
1105
+ f"eviction.feature_table.{t}.evicted_counts"
1106
+ )
1107
+ self.stats_reporter.register_stats(
1108
+ f"eviction.feature_table.{t}.processed_counts"
1109
+ )
1110
+ self.stats_reporter.register_stats(
1111
+ f"eviction.feature_table.{t}.evict_rate"
1112
+ )
1113
+ self.stats_reporter.register_stats(
1114
+ "eviction.feature_table.full_duration_ms"
1115
+ )
1116
+ self.stats_reporter.register_stats(
1117
+ "eviction.feature_table.exec_duration_ms"
1118
+ )
1119
+ self.stats_reporter.register_stats(
1120
+ "eviction.feature_table.dry_run_exec_duration_ms"
1121
+ )
1122
+ self.stats_reporter.register_stats(
1123
+ "eviction.feature_table.exec_div_full_duration_rate"
1124
+ )
975
1125
 
976
1126
  self.bounds_check_version: int = get_bounds_check_version_for_platform()
977
1127
 
1128
+ self._pg = pg
1129
+
978
1130
  @cached_property
979
1131
  def cache_row_dim(self) -> int:
980
1132
  """
@@ -982,7 +1134,9 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
982
1134
  padding to the nearest 4 elements and the optimizer state appended to
983
1135
  the back of the row
984
1136
  """
985
- if self.enable_optimizer_offloading:
1137
+
1138
+ # For st publish, we only need to load weight for publishing and bulk eval
1139
+ if self.enable_optimizer_offloading and not self.load_ckpt_without_opt:
986
1140
  return self.max_D + pad4(
987
1141
  # Compute the number of elements of cache_dtype needed to store
988
1142
  # the optimizer state
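The change above makes `cache_row_dim` skip the appended optimizer state when `load_ckpt_without_opt` is set. A minimal sketch of the sizing logic, assuming `pad4` rounds an element count up to the nearest multiple of four (the element count itself comes from the optimizer):

    def pad4(n: int) -> int:
        # Round up to the nearest multiple of 4 elements (assumed behavior)
        return (n + 3) // 4 * 4

    def cache_row_dim(max_D: int, opt_state_elems: int,
                      offload: bool, load_ckpt_without_opt: bool) -> int:
        if offload and not load_ckpt_without_opt:
            # Weights plus the padded optimizer state appended to the row
            return max_D + pad4(opt_state_elems)
        return max_D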
@@ -1182,10 +1336,10 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1182
1336
  self,
1183
1337
  split: SplitState,
1184
1338
  prefix: str,
1185
- dtype: Type[torch.dtype],
1339
+ dtype: type[torch.dtype],
1186
1340
  enforce_hbm: bool = False,
1187
1341
  make_dev_param: bool = False,
1188
- dev_reshape: Optional[Tuple[int, ...]] = None,
1342
+ dev_reshape: Optional[tuple[int, ...]] = None,
1189
1343
  ) -> None:
1190
1344
  apply_split_helper(
1191
1345
  self.register_buffer,
@@ -1208,11 +1362,11 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1208
1362
 
1209
1363
  def to_pinned_cpu_on_stream_wait_on_another_stream(
1210
1364
  self,
1211
- tensors: List[Tensor],
1365
+ tensors: list[Tensor],
1212
1366
  stream: torch.cuda.Stream,
1213
1367
  stream_to_wait_on: torch.cuda.Stream,
1214
1368
  post_event: Optional[torch.cuda.Event] = None,
1215
- ) -> List[Tensor]:
1369
+ ) -> list[Tensor]:
1216
1370
  """
1217
1371
  Transfer input tensors from GPU to CPU using a pinned host
1218
1372
  buffer. The transfer is carried out on the given stream
@@ -1274,6 +1428,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1274
1428
  Returns:
1275
1429
  None
1276
1430
  """
1431
+ if not self.training: # if not training, freeze the embedding
1432
+ return
1277
1433
  with record_function(f"## ssd_evict_{name} ##"):
1278
1434
  with torch.cuda.stream(stream):
1279
1435
  if pre_event is not None:
@@ -1286,7 +1442,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1286
1442
  self.record_function_via_dummy_profile(
1287
1443
  f"## ssd_set_{name} ##",
1288
1444
  self.ssd_db.set_cuda,
1289
- indices_cpu.cpu(),
1445
+ indices_cpu,
1290
1446
  rows_cpu,
1291
1447
  actions_count_cpu,
1292
1448
  self.timestep,
@@ -1450,7 +1606,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1450
1606
  def _update_cache_counter_and_pointers(
1451
1607
  self,
1452
1608
  module: nn.Module,
1453
- grad_input: Union[Tuple[Tensor, ...], Tensor],
1609
+ grad_input: Union[tuple[Tensor, ...], Tensor],
1454
1610
  ) -> None:
1455
1611
  """
1456
1612
  Update cache line locking counter and pointers before backward
@@ -1535,9 +1691,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1535
1691
  if len(self.ssd_location_update_data) == 0:
1536
1692
  return
1537
1693
 
1538
- (sp_curr_next_map, inserted_rows_next) = self.ssd_location_update_data.pop(
1539
- 0
1540
- )
1694
+ sp_curr_next_map, inserted_rows_next = self.ssd_location_update_data.pop(0)
1541
1695
 
1542
1696
  # Update pointers
1543
1697
  torch.ops.fbgemm.ssd_update_row_addrs(
@@ -1552,12 +1706,63 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1552
1706
  unique_indices_length_curr=curr_data.actions_count_gpu,
1553
1707
  )
1554
1708
 
1709
+ def _update_feature_score_metadata(
1710
+ self,
1711
+ linear_cache_indices: Tensor,
1712
+ weights: Tensor,
1713
+ d2h_stream: torch.cuda.Stream,
1714
+ write_stream: torch.cuda.Stream,
1715
+ pre_event_for_write: torch.cuda.Event,
1716
+ post_event: Optional[torch.cuda.Event] = None,
1717
+ ) -> None:
1718
+ """
1719
+ Write feature score metadata to DRAM
1720
+
1721
+ This method performs D2H copy on d2h_stream, then writes to DRAM on write_stream.
1722
+ The caller is responsible for ensuring d2h_stream doesn't compete with other D2H operations.
1723
+
1724
+ Args:
1725
+ linear_cache_indices: GPU tensor containing cache indices
1726
+ weights: GPU tensor containing feature scores
1727
+ d2h_stream: Stream for D2H copy operation (should already be synchronized appropriately)
1728
+ write_stream: Stream for metadata write operation
1729
+ pre_event_for_write: Event to wait on before writing metadata (e.g., wait for eviction)
1730
+ post_event: Event to record when the operation is done
1731
+ """
1732
+ # Start D2H copy on d2h_stream
1733
+ with torch.cuda.stream(d2h_stream):
1734
+ # Record streams to prevent premature deallocation
1735
+ linear_cache_indices.record_stream(d2h_stream)
1736
+ weights.record_stream(d2h_stream)
1737
+ # Do the D2H copy
1738
+ linear_cache_indices_cpu = self.to_pinned_cpu(linear_cache_indices)
1739
+ score_weights_cpu = self.to_pinned_cpu(weights)
1740
+
1741
+ # Write feature score metadata to DRAM
1742
+ with record_function("## ssd_write_feature_score_metadata ##"):
1743
+ with torch.cuda.stream(write_stream):
1744
+ write_stream.wait_event(pre_event_for_write)
1745
+ write_stream.wait_stream(d2h_stream)
1746
+ self.record_function_via_dummy_profile(
1747
+ "## ssd_write_feature_score_metadata ##",
1748
+ self.ssd_db.set_feature_score_metadata_cuda,
1749
+ linear_cache_indices_cpu,
1750
+ torch.tensor(
1751
+ [score_weights_cpu.shape[0]], device="cpu", dtype=torch.long
1752
+ ),
1753
+ score_weights_cpu,
1754
+ )
1755
+
1756
+ if post_event is not None:
1757
+ write_stream.record_event(post_event)
1758
+
1555
1759
  def prefetch(
1556
1760
  self,
1557
1761
  indices: Tensor,
1558
1762
  offsets: Tensor,
1763
+ weights: Optional[Tensor] = None, # todo: need to update caller
1559
1764
  forward_stream: Optional[torch.cuda.Stream] = None,
1560
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
1765
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
1561
1766
  ) -> None:
1562
1767
  if self.prefetch_stream is None and forward_stream is not None:
1563
1768
  # Set the prefetch stream to the current stream
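The new `_update_feature_score_metadata` above splits the work across two streams: the D2H copy runs on `d2h_stream`, and the DRAM write on `write_stream` starts only after both a pre-event and the copy have completed. A stripped-down sketch of that ordering, with `write_fn` standing in for the backend call:

    import torch

    def ordered_metadata_write(indices_gpu, scores_gpu, d2h_stream, write_stream,
                               pre_event, write_fn):
        with torch.cuda.stream(d2h_stream):
            # Keep the GPU tensors alive for the duration of the async copies
            indices_gpu.record_stream(d2h_stream)
            scores_gpu.record_stream(d2h_stream)
            indices_cpu = indices_gpu.to("cpu", non_blocking=True)
            scores_cpu = scores_gpu.to("cpu", non_blocking=True)
        with torch.cuda.stream(write_stream):
            write_stream.wait_event(pre_event)    # e.g. wait for cache eviction
            write_stream.wait_stream(d2h_stream)  # wait for the copies to land
            write_fn(indices_cpu, scores_cpu)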
@@ -1581,6 +1786,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1581
1786
  self._prefetch(
1582
1787
  indices,
1583
1788
  offsets,
1789
+ weights,
1584
1790
  vbe_metadata,
1585
1791
  forward_stream,
1586
1792
  )
@@ -1589,11 +1795,17 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1589
1795
  self,
1590
1796
  indices: Tensor,
1591
1797
  offsets: Tensor,
1798
+ weights: Optional[Tensor] = None,
1592
1799
  vbe_metadata: Optional[invokers.lookup_args.VBEMetadata] = None,
1593
1800
  forward_stream: Optional[torch.cuda.Stream] = None,
1594
1801
  ) -> None:
1595
- # TODO: Refactor prefetch
1802
+ # Wait for any ongoing direct_write_embedding operations to complete
1803
+ # Moving this from forward() to _prefetch() is more logical as direct_write
1804
+ # operations affect the same cache structures that prefetch interacts with
1596
1805
  current_stream = torch.cuda.current_stream()
1806
+ if self._embedding_cache_mode:
1807
+ current_stream.wait_event(self.direct_write_l1_complete_event)
1808
+ current_stream.wait_event(self.direct_write_sp_complete_event)
1597
1809
 
1598
1810
  B_offsets = None
1599
1811
  max_B = -1
@@ -1700,8 +1912,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1700
1912
  name="cache_update",
1701
1913
  )
1702
1914
  current_stream.wait_event(self.ssd_event_cache_streaming_synced)
1703
- (updated_indices, updated_counts_gpu) = (
1704
- self.prefetched_info.pop(0)
1915
+ updated_indices, updated_counts_gpu = self.prefetched_info.pop(
1916
+ 0
1705
1917
  )
1706
1918
  self.lxu_cache_updated_indices[: updated_indices.size(0)].copy_(
1707
1919
  updated_indices,
@@ -1878,12 +2090,13 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1878
2090
  # Store info for evicting the previous iteration's
1879
2091
  # scratch pad after the corresponding backward pass is
1880
2092
  # done
1881
- self.ssd_location_update_data.append(
1882
- (
1883
- sp_curr_prev_map_gpu,
1884
- inserted_rows,
2093
+ if self.training:
2094
+ self.ssd_location_update_data.append(
2095
+ (
2096
+ sp_curr_prev_map_gpu,
2097
+ inserted_rows,
2098
+ )
1885
2099
  )
1886
- )
1887
2100
 
1888
2101
  # Ensure the previous iterations eviction is complete
1889
2102
  current_stream.wait_event(self.ssd_event_sp_evict)
@@ -1931,7 +2144,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1931
2144
  self.ssd_cache_stats = torch.add(
1932
2145
  self.ssd_cache_stats, self.local_ssd_cache_stats
1933
2146
  )
1934
- self._report_kv_backend_stats()
2147
+ # only report metrics from rank0 to avoid flooded logging
2148
+ if dist.get_rank() == 0:
2149
+ self._report_kv_backend_stats()
2150
+
2151
+ # May trigger eviction before the SSD get if the free-mem trigger mode is enabled
2152
+ self.may_trigger_eviction()
1935
2153
 
1936
2154
  # Fetch data from SSD
1937
2155
  if linear_cache_indices.numel() > 0:
@@ -1955,21 +2173,35 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
1955
2173
  use_pipeline=self.prefetch_pipeline,
1956
2174
  )
1957
2175
 
1958
- if linear_cache_indices.numel() > 0:
1959
- # Evict rows from cache to SSD
1960
- self.evict(
1961
- rows=self.lxu_cache_evicted_weights,
1962
- indices_cpu=self.lxu_cache_evicted_indices,
1963
- actions_count_cpu=self.lxu_cache_evicted_count,
1964
- stream=self.ssd_eviction_stream,
1965
- pre_event=self.ssd_event_get,
1966
- # Record completion event after scratch pad eviction
1967
- # instead since that happens after L1 eviction
1968
- post_event=self.ssd_event_cache_evict,
1969
- is_rows_uvm=True,
1970
- name="cache",
1971
- is_bwd=False,
1972
- )
2176
+ if self.training:
2177
+ if linear_cache_indices.numel() > 0:
2178
+ # Evict rows from cache to SSD
2179
+ self.evict(
2180
+ rows=self.lxu_cache_evicted_weights,
2181
+ indices_cpu=self.lxu_cache_evicted_indices,
2182
+ actions_count_cpu=self.lxu_cache_evicted_count,
2183
+ stream=self.ssd_eviction_stream,
2184
+ pre_event=self.ssd_event_get,
2185
+ # Record completion event after scratch pad eviction
2186
+ # instead since that happens after L1 eviction
2187
+ post_event=self.ssd_event_cache_evict,
2188
+ is_rows_uvm=True,
2189
+ name="cache",
2190
+ is_bwd=False,
2191
+ )
2192
+ if (
2193
+ self.backend_type == BackendType.DRAM
2194
+ and weights is not None
2195
+ and linear_cache_indices.numel() > 0
2196
+ ):
2197
+ # Reuse ssd_memcpy_stream for feature score D2H since critical D2H is done
2198
+ self._update_feature_score_metadata(
2199
+ linear_cache_indices=linear_cache_indices,
2200
+ weights=weights,
2201
+ d2h_stream=self.ssd_memcpy_stream,
2202
+ write_stream=self.feature_score_stream,
2203
+ pre_event_for_write=self.ssd_event_cache_evict,
2204
+ )
1973
2205
 
1974
2206
  # Generate row addresses (pointing to either L1 or the current
1975
2207
  # iteration's scratch pad)
@@ -2051,24 +2283,32 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2051
2283
  )
2052
2284
  )
2053
2285
 
2054
- # Store scratch pad info for post backward eviction
2055
- self.ssd_scratch_pad_eviction_data.append(
2056
- (
2057
- inserted_rows,
2058
- post_bwd_evicted_indices_cpu,
2059
- actions_count_cpu,
2060
- linear_cache_indices.numel() > 0,
2286
+ # Store scratch pad info for post backward eviction only for training
2287
+ # for eval job, no backward pass, so no need to store this info
2288
+ if self.training:
2289
+ self.ssd_scratch_pad_eviction_data.append(
2290
+ (
2291
+ inserted_rows,
2292
+ post_bwd_evicted_indices_cpu,
2293
+ actions_count_cpu,
2294
+ linear_cache_indices.numel() > 0,
2295
+ )
2061
2296
  )
2062
- )
2063
2297
 
2064
2298
  # Store data for forward
2065
2299
  self.ssd_prefetch_data.append(prefetch_data)
2066
2300
 
2301
+ # Record an event to mark the completion of prefetch operations
2302
+ # This will be used by direct_write_embedding to ensure it doesn't run concurrently with prefetch
2303
+ current_stream.record_event(self.prefetch_complete_event)
2304
+
2067
2305
  @torch.jit.ignore
2068
2306
  def _generate_vbe_metadata(
2069
2307
  self,
2070
2308
  offsets: Tensor,
2071
- batch_size_per_feature_per_rank: Optional[List[List[int]]],
2309
+ batch_size_per_feature_per_rank: Optional[list[list[int]]],
2310
+ vbe_output: Optional[Tensor] = None,
2311
+ vbe_output_offsets: Optional[Tensor] = None,
2072
2312
  ) -> invokers.lookup_args.VBEMetadata:
2073
2313
  # Blocking D2H copy, but only runs at first call
2074
2314
  self.feature_dims = self.feature_dims.cpu()
@@ -2087,6 +2327,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2087
2327
  self.pooling_mode,
2088
2328
  self.feature_dims,
2089
2329
  self.current_device,
2330
+ vbe_output,
2331
+ vbe_output_offsets,
2090
2332
  )
2091
2333
 
2092
2334
  def _increment_iteration(self) -> int:
@@ -2113,14 +2355,30 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2113
2355
  self,
2114
2356
  indices: Tensor,
2115
2357
  offsets: Tensor,
2358
+ weights: Optional[Tensor] = None,
2116
2359
  per_sample_weights: Optional[Tensor] = None,
2117
2360
  feature_requires_grad: Optional[Tensor] = None,
2118
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
2361
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
2362
+ vbe_output: Optional[Tensor] = None,
2363
+ vbe_output_offsets: Optional[Tensor] = None,
2119
2364
  # pyre-fixme[7]: Expected `Tensor` but got implicit return value of `None`.
2120
2365
  ) -> Tensor:
2121
2366
  self.clear_cache()
2367
+ if vbe_output is not None or vbe_output_offsets is not None:
2368
+ # CPU is not supported in SSD TBE
2369
+ check_allocated_vbe_output(
2370
+ self.output_dtype,
2371
+ batch_size_per_feature_per_rank,
2372
+ vbe_output,
2373
+ vbe_output_offsets,
2374
+ )
2122
2375
  indices, offsets, per_sample_weights, vbe_metadata = self.prepare_inputs(
2123
- indices, offsets, per_sample_weights, batch_size_per_feature_per_rank
2376
+ indices,
2377
+ offsets,
2378
+ per_sample_weights,
2379
+ batch_size_per_feature_per_rank,
2380
+ vbe_output=vbe_output,
2381
+ vbe_output_offsets=vbe_output_offsets,
2124
2382
  )
2125
2383
 
2126
2384
  if len(self.timesteps_prefetched) == 0:
@@ -2134,7 +2392,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2134
2392
  context=self.step,
2135
2393
  stream=self.ssd_eviction_stream,
2136
2394
  ):
2137
- self._prefetch(indices, offsets, vbe_metadata)
2395
+ self._prefetch(indices, offsets, weights, vbe_metadata)
2138
2396
 
2139
2397
  assert len(self.ssd_prefetch_data) > 0
2140
2398
 
@@ -2205,13 +2463,21 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2205
2463
  self.step += 1
2206
2464
 
2207
2465
  # Increment the iteration (value is used for certain optimizers)
2208
- self._increment_iteration()
2209
-
2210
- if self.optimizer == OptimType.EXACT_SGD:
2211
- raise AssertionError(
2212
- "SSDTableBatchedEmbeddingBags currently does not support SGD"
2466
+ iter_int = self._increment_iteration()
2467
+
2468
+ if self.optimizer in [OptimType.PARTIAL_ROWWISE_ADAM, OptimType.ADAM]:
2469
+ momentum2 = invokers.lookup_args_ssd.Momentum(
2470
+ # pyre-ignore[6]
2471
+ dev=self.momentum2_dev,
2472
+ # pyre-ignore[6]
2473
+ host=self.momentum2_host,
2474
+ # pyre-ignore[6]
2475
+ uvm=self.momentum2_uvm,
2476
+ # pyre-ignore[6]
2477
+ offsets=self.momentum2_offsets,
2478
+ # pyre-ignore[6]
2479
+ placements=self.momentum2_placements,
2213
2480
  )
2214
- return invokers.lookup_sgd_ssd.invoke(common_args, self.optimizer_args)
2215
2481
 
2216
2482
  momentum1 = invokers.lookup_args_ssd.Momentum(
2217
2483
  dev=self.momentum1_dev,
@@ -2226,10 +2492,44 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2226
2492
  common_args, self.optimizer_args, momentum1
2227
2493
  )
2228
2494
 
2495
+ elif self.optimizer == OptimType.PARTIAL_ROWWISE_ADAM:
2496
+ return invokers.lookup_partial_rowwise_adam_ssd.invoke(
2497
+ common_args,
2498
+ self.optimizer_args,
2499
+ momentum1,
2500
+ # pyre-ignore[61]
2501
+ momentum2,
2502
+ iter_int,
2503
+ )
2504
+
2505
+ elif self.optimizer == OptimType.ADAM:
2506
+ row_counter = invokers.lookup_args_ssd.Momentum(
2507
+ # pyre-fixme[6]
2508
+ dev=self.row_counter_dev,
2509
+ # pyre-fixme[6]
2510
+ host=self.row_counter_host,
2511
+ # pyre-fixme[6]
2512
+ uvm=self.row_counter_uvm,
2513
+ # pyre-fixme[6]
2514
+ offsets=self.row_counter_offsets,
2515
+ # pyre-fixme[6]
2516
+ placements=self.row_counter_placements,
2517
+ )
2518
+
2519
+ return invokers.lookup_adam_ssd.invoke(
2520
+ common_args,
2521
+ self.optimizer_args,
2522
+ momentum1,
2523
+ # pyre-ignore[61]
2524
+ momentum2,
2525
+ iter_int,
2526
+ row_counter=row_counter,
2527
+ )
2528
+
2229
2529
  @torch.jit.ignore
2230
2530
  def _split_optimizer_states_non_kv_zch(
2231
2531
  self,
2232
- ) -> List[List[torch.Tensor]]:
2532
+ ) -> list[list[torch.Tensor]]:
2233
2533
  """
2234
2534
  Returns a list of optimizer states (view), split by table.
2235
2535
 
@@ -2246,11 +2546,11 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2246
2546
  """
2247
2547
 
2248
2548
  # Row count per table
2249
- (rows, dims) = zip(*self.embedding_specs)
2549
+ rows, dims = zip(*self.embedding_specs)
2250
2550
  # Cumulative row counts per table for rowwise states
2251
- row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows))
2551
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
2252
2552
  # Cumulative element counts per table for elementwise states
2253
- elem_count_cumsum: List[int] = [0] + list(
2553
+ elem_count_cumsum: list[int] = [0] + list(
2254
2554
  itertools.accumulate([r * d for r, d in self.embedding_specs])
2255
2555
  )
2256
2556
 
@@ -2286,6 +2586,17 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2286
2586
  ]
2287
2587
  for t, _ in enumerate(rows)
2288
2588
  ]
2589
+
2590
+ elif self.optimizer == OptimType.ADAM:
2591
+ return [
2592
+ [
2593
+ _slice(self.momentum1_dev, t, rowwise=False),
2594
+ # pyre-ignore[6]
2595
+ _slice(self.momentum2_dev, t, rowwise=False),
2596
+ ]
2597
+ for t, _ in enumerate(rows)
2598
+ ]
2599
+
2289
2600
  else:
2290
2601
  raise NotImplementedError(
2291
2602
  f"Getting optimizer states is not supported for {self.optimizer}"
@@ -2295,14 +2606,14 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2295
2606
  def _split_optimizer_states_kv_zch_no_offloading(
2296
2607
  self,
2297
2608
  sorted_ids: torch.Tensor,
2298
- ) -> List[List[torch.Tensor]]:
2609
+ ) -> list[list[torch.Tensor]]:
2299
2610
 
2300
2611
  # Row count per table
2301
- (rows, dims) = zip(*self.embedding_specs)
2612
+ rows, dims = zip(*self.embedding_specs)
2302
2613
  # Cumulative row counts per table for rowwise states
2303
- row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows))
2614
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
2304
2615
  # Cumulative element counts per table for elementwise states
2305
- elem_count_cumsum: List[int] = [0] + list(
2616
+ elem_count_cumsum: list[int] = [0] + list(
2306
2617
  itertools.accumulate([r * d for r, d in self.embedding_specs])
2307
2618
  )
2308
2619
 
@@ -2332,7 +2643,9 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2332
2643
  # based on the sorted_ids compute the table offset for the
2333
2644
  # table, view the slice as 2D tensor of e x d, then fetch the
2334
2645
  # sub-slice by local ids
2335
- local_ids = sorted_ids[t] - bucket_id_start * bucket_size
2646
+ #
2647
+ # local_ids is [N, 1], flatten it to N to keep the returned tensor 2D
2648
+ local_ids = (sorted_ids[t] - bucket_id_start * bucket_size).view(-1)
2336
2649
  return (
2337
2650
  tensor.detach()
2338
2651
  .cpu()[elem_count_cumsum[t] : elem_count_cumsum[t + 1]]
@@ -2364,6 +2677,16 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2364
2677
  for t, _ in enumerate(rows)
2365
2678
  ]
2366
2679
 
2680
+ elif self.optimizer == OptimType.ADAM:
2681
+ return [
2682
+ [
2683
+ _slice("momentum1", self.momentum1_dev, t, rowwise=False),
2684
+ # pyre-ignore[6]
2685
+ _slice("momentum2", self.momentum2_dev, t, rowwise=False),
2686
+ ]
2687
+ for t, _ in enumerate(rows)
2688
+ ]
2689
+
2367
2690
  else:
2368
2691
  raise NotImplementedError(
2369
2692
  f"Getting optimizer states is not supported for {self.optimizer}"
@@ -2375,12 +2698,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2375
2698
  sorted_ids: torch.Tensor,
2376
2699
  no_snapshot: bool = True,
2377
2700
  should_flush: bool = False,
2378
- ) -> List[List[torch.Tensor]]:
2701
+ ) -> list[list[torch.Tensor]]:
2379
2702
  dtype = self.weights_precision.as_dtype()
2380
2703
  # Row count per table
2381
- (rows_, dims_) = zip(*self.embedding_specs)
2704
+ rows_, dims_ = zip(*self.embedding_specs)
2382
2705
  # Cumulative row counts per table for rowwise states
2383
- row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows_))
2706
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows_))
2384
2707
 
2385
2708
  snapshot_handle, _ = self._may_create_snapshot_for_state_dict(
2386
2709
  no_snapshot=no_snapshot,
@@ -2390,7 +2713,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2390
2713
  # pyre-ignore[53]
2391
2714
  def _fetch_offloaded_optimizer_states(
2392
2715
  t: int,
2393
- ) -> List[Tensor]:
2716
+ ) -> list[Tensor]:
2394
2717
  e: int = rows_[t]
2395
2718
  d: int = dims_[t]
2396
2719
 
@@ -2403,12 +2726,31 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2403
2726
  # Count of rows to fetch
2404
2727
  rows_to_fetch = sorted_ids[t].numel()
2405
2728
 
2729
+ # Lookup the byte offsets for each optimizer state
2730
+ optimizer_state_byte_offsets = self.optimizer.byte_offsets_along_row(
2731
+ d, self.weights_precision, self.optimizer_state_dtypes
2732
+ )
2733
+ # Find the minimum start of all the start/end pairs - we have to
2734
+ # offset the start/end pairs by this value to get the correct start/end
2735
+ offset_ = min(
2736
+ [start for _, (start, _) in optimizer_state_byte_offsets.items()]
2737
+ )
2738
+ # Update the start/end pairs to be relative to offset_
2739
+ optimizer_state_byte_offsets = dict(
2740
+ (k, (v1 - offset_, v2 - offset_))
2741
+ for k, (v1, v2) in optimizer_state_byte_offsets.items()
2742
+ )
2743
+
2406
2744
  # Since the backend returns cache rows that pack the weights and
2407
2745
  # optimizer states together, reading the whole tensor could cause OOM,
2408
2746
  # so we use the KVTensorWrapper abstraction to query the backend and
2409
2747
  # fetch the data in chunks instead.
2410
2748
  tensor_wrapper = torch.classes.fbgemm.KVTensorWrapper(
2411
- shape=[e, self.optimizer_state_dim],
2749
+ shape=[
2750
+ e,
2751
+ # Dim is in terms of the **weights** dtype
2752
+ self.optimizer_state_dim,
2753
+ ],
2412
2754
  dtype=dtype,
2413
2755
  row_offset=row_offset,
2414
2756
  snapshot_handle=snapshot_handle,
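The offset handling added above rebases every (start, end) byte pair returned by `byte_offsets_along_row` onto the smallest start, since the fetched buffer contains only the optimizer states rather than the full cache row. A worked example with made-up offsets:

    offsets = {"momentum1": (256, 512), "momentum2": (512, 516)}  # illustrative values
    base = min(start for start, _ in offsets.values())
    relative = {k: (s - base, e - base) for k, (s, e) in offsets.items()}
    # relative == {"momentum1": (0, 256), "momentum2": (256, 260)}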
@@ -2421,19 +2763,6 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2421
2763
  else tensor_wrapper.set_dram_db_wrapper(self.ssd_db)
2422
2764
  )
2423
2765
 
2424
- # Lookup the byte offsets for each optimizer state
2425
- optimizer_state_byte_offsets = self.optimizer.byte_offsets_along_row(
2426
- d, self.weights_precision, self.optimizer_state_dtypes
2427
- )
2428
- # Since we will be working with buffer rows that contain the
2429
- # optimizer states only, we need to offset the byte offsets by
2430
- # D * dtype.itemsize
2431
- offset_ = d * dtype.itemsize
2432
- optimizer_state_byte_offsets = dict(
2433
- (k, (v1 - offset_, v2 - offset_))
2434
- for k, (v1, v2) in optimizer_state_byte_offsets.items()
2435
- )
2436
-
2437
2766
  # Fetch the state size table for the given weights domension
2438
2767
  state_size_table = self.optimizer.state_size_table(d)
2439
2768
 
@@ -2462,10 +2791,10 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2462
2791
  )
2463
2792
 
2464
2793
  # Now split up the buffer into N views, N for each optimizer state
2465
- optimizer_states: List[Tensor] = []
2794
+ optimizer_states: list[Tensor] = []
2466
2795
  for state_name in self.optimizer.state_names():
2467
2796
  # Extract the offsets
2468
- (start, end) = optimizer_state_byte_offsets[state_name]
2797
+ start, end = optimizer_state_byte_offsets[state_name]
2469
2798
 
2470
2799
  state = optimizer_states_buffer.view(
2471
2800
  # Force tensor to byte view
@@ -2500,13 +2829,150 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2500
2829
  for t, d in enumerate(dims_)
2501
2830
  ]
2502
2831
 
2832
+ @torch.jit.ignore
2833
+ def _split_optimizer_states_kv_zch_whole_row(
2834
+ self,
2835
+ sorted_ids: torch.Tensor,
2836
+ no_snapshot: bool = True,
2837
+ should_flush: bool = False,
2838
+ ) -> list[list[torch.Tensor]]:
2839
+ dtype = self.weights_precision.as_dtype()
2840
+
2841
+ # Row and dimension counts per table
2842
+ # rows_ is only used here to compute the virtual table offsets
2843
+ rows_, dims_ = zip(*self.embedding_specs)
2844
+
2845
+ # Cumulative row counts per (virtual) table for rowwise states
2846
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows_))
2847
+
2848
+ snapshot_handle, _ = self._may_create_snapshot_for_state_dict(
2849
+ no_snapshot=no_snapshot,
2850
+ should_flush=should_flush,
2851
+ )
2852
+
2853
+ # pyre-ignore[53]
2854
+ def _fetch_offloaded_optimizer_states(
2855
+ t: int,
2856
+ ) -> list[Tensor]:
2857
+ d: int = dims_[t]
2858
+
2859
+ # pyre-ignore[16]
2860
+ bucket_id_start, _ = self.kv_zch_params.bucket_offsets[t]
2861
+ # pyre-ignore[16]
2862
+ bucket_size = self.kv_zch_params.bucket_sizes[t]
2863
+ row_offset = row_count_cumsum[t] - (bucket_id_start * bucket_size)
2864
+
2865
+ # When backend returns whole row, the optimizer will be returned as
2866
+ # PMT directly
2867
+ if sorted_ids[t].size(0) == 0 and self.local_weight_counts[t] > 0:
2868
+ logging.info(
2869
+ f"Before opt PMT loading, resetting id tensor with {self.local_weight_counts[t]}"
2870
+ )
2871
+ sorted_ids[t] = torch.zeros(
2872
+ (self.local_weight_counts[t], 1),
2873
+ device=torch.device("cpu"),
2874
+ dtype=torch.int64,
2875
+ )
2876
+
2877
+ # Lookup the byte offsets for each optimizer state relative to the
2878
+ # start of the weights
2879
+ optimizer_state_byte_offsets = self.optimizer.byte_offsets_along_row(
2880
+ d, self.weights_precision, self.optimizer_state_dtypes
2881
+ )
2882
+ # Get the number of elements (of the optimizer state dtype) per state
2883
+ optimizer_state_size_table = self.optimizer.state_size_table(d)
2884
+
2885
+ # Get metaheader dimensions in number of elements of weight dtype
2886
+ metaheader_dim = (
2887
+ # pyre-ignore[16]
2888
+ self.kv_zch_params.eviction_policy.meta_header_lens[t]
2889
+ )
2890
+
2891
+ # Now split up the buffer into N views, N for each optimizer state
2892
+ optimizer_states: list[PartiallyMaterializedTensor] = []
2893
+ for state_name in self.optimizer.state_names():
2894
+ state_dtype = self.optimizer_state_dtypes.get(
2895
+ state_name, SparseType.FP32
2896
+ ).as_dtype()
2897
+
2898
+ # Get the size of the state in elements of the optimizer state,
2899
+ # in terms of the **weights** dtype
2900
+ state_size = math.ceil(
2901
+ optimizer_state_size_table[state_name]
2902
+ * state_dtype.itemsize
2903
+ / dtype.itemsize
2904
+ )
2905
+
2906
+ # Extract the offsets relative to the start of the weights (in
2907
+ # num bytes)
2908
+ start, _ = optimizer_state_byte_offsets[state_name]
2909
+
2910
+ # Convert the start to number of elements in terms of the
2911
+ # **weights** dtype, then add the mmetaheader dim offset
2912
+ start = metaheader_dim + start // dtype.itemsize
2913
+
2914
+ shape = [
2915
+ (
2916
+ sorted_ids[t].size(0)
2917
+ if sorted_ids is not None and sorted_ids[t].size(0) > 0
2918
+ else self.local_weight_counts[t]
2919
+ ),
2920
+ (
2921
+ # Dim is in terms of the **weights** dtype
2922
+ state_size
2923
+ ),
2924
+ ]
2925
+
2926
+ # NOTE: We have to view using the **weights** dtype, as
2927
+ # there is currently a bug with KVTensorWrapper where using
2928
+ # a different dtype does not result in the same bytes being
2929
+ # returned, e.g.
2930
+ #
2931
+ # KVTensorWrapper(dtype=fp32, width_offset=130, shape=[N, 1])
2932
+ #
2933
+ # is NOT the same as
2934
+ #
2935
+ # KVTensorWrapper(dtype=fp16, width_offset=260, shape=[N, 2]).view(-1).view(fp32)
2936
+ #
2937
+ # TODO: Fix KVTensorWrapper to support viewing data under different dtypes
2938
+ tensor_wrapper = torch.classes.fbgemm.KVTensorWrapper(
2939
+ shape=shape,
2940
+ dtype=(
2941
+ # NOTE: Use the *weights* dtype
2942
+ dtype
2943
+ ),
2944
+ row_offset=row_offset,
2945
+ snapshot_handle=snapshot_handle,
2946
+ sorted_indices=sorted_ids[t],
2947
+ width_offset=(
2948
+ # NOTE: Width offset is in terms of **weights** dtype
2949
+ start
2950
+ ),
2951
+ # Optimizer written to DB with weights, so skip write here
2952
+ read_only=True,
2953
+ )
2954
+ (
2955
+ tensor_wrapper.set_embedding_rocks_dp_wrapper(self.ssd_db)
2956
+ if self.backend_type == BackendType.SSD
2957
+ else tensor_wrapper.set_dram_db_wrapper(self.ssd_db)
2958
+ )
2959
+
2960
+ optimizer_states.append(
2961
+ PartiallyMaterializedTensor(tensor_wrapper, True)
2962
+ )
2963
+
2964
+ # pyre-ignore [7]
2965
+ return optimizer_states
2966
+
2967
+ return [_fetch_offloaded_optimizer_states(t) for t, _ in enumerate(dims_)]
2968
+
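A minimal standalone sketch of the offset arithmetic above, assuming FP16 weights, a single-element FP32 optimizer state, and a hypothetical metaheader of 8 weight elements (the real values come from byte_offsets_along_row, state_size_table, and meta_header_lens):

import math
import torch

weights_dtype = torch.float16   # assumed weights precision
state_dtype = torch.float32     # assumed dtype of a single optimizer state
metaheader_dim = 8              # hypothetical metaheader width, in weight elements

byte_start = 256                # hypothetical byte offset of the state within a row
state_elem_count = 1            # hypothetical element count of the state

# Size of the state expressed in elements of the *weights* dtype
state_size = math.ceil(state_elem_count * state_dtype.itemsize / weights_dtype.itemsize)

# Start column of the state, also in elements of the *weights* dtype, past the metaheader
start = metaheader_dim + byte_start // weights_dtype.itemsize

print(state_size, start)  # 2 136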
2503
2969
  @torch.jit.export
2504
2970
  def split_optimizer_states(
2505
2971
  self,
2506
- sorted_id_tensor: Optional[List[torch.Tensor]] = None,
2972
+ sorted_id_tensor: Optional[list[torch.Tensor]] = None,
2507
2973
  no_snapshot: bool = True,
2508
2974
  should_flush: bool = False,
2509
- ) -> List[List[torch.Tensor]]:
2975
+ ) -> list[list[torch.Tensor]]:
2510
2976
  """
2511
2977
  Returns a list of optimizer states split by table.
2512
2978
 
@@ -2555,75 +3021,11 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2555
3021
  )
2556
3022
 
2557
3023
  else:
2558
- snapshot_handle, _ = self._may_create_snapshot_for_state_dict(
2559
- no_snapshot=no_snapshot,
2560
- should_flush=should_flush,
3024
+ # Handle the KVZCH with-optimizer-offloading backend-whole-row case
3025
+ optimizer_states = self._split_optimizer_states_kv_zch_whole_row(
3026
+ sorted_id_tensor, no_snapshot, should_flush
2561
3027
  )
2562
3028
 
2563
- optimizer_states = []
2564
- table_offset = 0
2565
-
2566
- for t, (emb_height, emb_dim) in enumerate(self.embedding_specs):
2567
- # pyre-ignore
2568
- bucket_id_start, _ = self.kv_zch_params.bucket_offsets[t]
2569
- # pyre-ignore
2570
- bucket_size = self.kv_zch_params.bucket_sizes[t]
2571
- row_offset = table_offset - (bucket_id_start * bucket_size)
2572
-
2573
- # When backend returns whole row, the optimizer will be returned as PMT directly
2574
- # pyre-ignore [16]
2575
- if sorted_id_tensor[t].size(0) == 0 and self.local_weight_counts[t] > 0:
2576
- logging.info(
2577
- f"before opt PMT loading, resetting id tensor with {self.local_weight_counts[t]}"
2578
- )
2579
- # pyre-ignore [16]
2580
- sorted_id_tensor[t] = torch.zeros(
2581
- (self.local_weight_counts[t], 1),
2582
- device=torch.device("cpu"),
2583
- dtype=torch.int64,
2584
- )
2585
-
2586
- metaheader_dim = (
2587
- # pyre-ignore[16]
2588
- self.kv_zch_params.eviction_policy.meta_header_lens[t]
2589
- )
2590
- tensor_wrapper = torch.classes.fbgemm.KVTensorWrapper(
2591
- shape=[
2592
- (
2593
- sorted_id_tensor[t].size(0)
2594
- if sorted_id_tensor is not None
2595
- and sorted_id_tensor[t].size(0) > 0
2596
- else emb_height
2597
- ),
2598
- self.optimizer_state_dim,
2599
- ],
2600
- dtype=self.weights_precision.as_dtype(),
2601
- row_offset=row_offset,
2602
- snapshot_handle=snapshot_handle,
2603
- sorted_indices=sorted_id_tensor[t],
2604
- width_offset=(
2605
- metaheader_dim # metaheader is already padded so no need for pad4
2606
- + pad4(emb_dim)
2607
- ),
2608
- read_only=True, # optimizer written to DB with weights, so skip write here
2609
- )
2610
- (
2611
- tensor_wrapper.set_embedding_rocks_dp_wrapper(self.ssd_db)
2612
- if self.backend_type == BackendType.SSD
2613
- else tensor_wrapper.set_dram_db_wrapper(self.ssd_db)
2614
- )
2615
-
2616
- optimizer_states.append(
2617
- [
2618
- PartiallyMaterializedTensor(
2619
- tensor_wrapper,
2620
- True if self.kv_zch_params else False,
2621
- )
2622
- ]
2623
- )
2624
-
2625
- table_offset += emb_height
2626
-
2627
3029
  logging.info(
2628
3030
  f"KV ZCH tables split_optimizer_states query latency: {(time.time() - start_time) * 1000} ms, "
2629
3031
  # pyre-ignore[16]
@@ -2635,14 +3037,14 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2635
3037
  @torch.jit.export
2636
3038
  def get_optimizer_state(
2637
3039
  self,
2638
- sorted_id_tensor: Optional[List[torch.Tensor]],
3040
+ sorted_id_tensor: Optional[list[torch.Tensor]],
2639
3041
  no_snapshot: bool = True,
2640
3042
  should_flush: bool = False,
2641
- ) -> List[Dict[str, torch.Tensor]]:
3043
+ ) -> list[dict[str, torch.Tensor]]:
2642
3044
  """
2643
3045
  Returns a list of dictionaries of optimizer states split by table.
2644
3046
  """
2645
- states_list: List[List[Tensor]] = self.split_optimizer_states(
3047
+ states_list: list[list[Tensor]] = self.split_optimizer_states(
2646
3048
  sorted_id_tensor=sorted_id_tensor,
2647
3049
  no_snapshot=no_snapshot,
2648
3050
  should_flush=should_flush,
@@ -2651,13 +3053,13 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2651
3053
  return [dict(zip(state_names, states)) for states in states_list]
2652
3054
 
2653
3055
  @torch.jit.export
2654
- def debug_split_embedding_weights(self) -> List[torch.Tensor]:
3056
+ def debug_split_embedding_weights(self) -> list[torch.Tensor]:
2655
3057
  """
2656
3058
  Returns a list of weights, split by table.
2657
3059
 
2658
3060
  Testing only, very slow.
2659
3061
  """
2660
- (rows, _) = zip(*self.embedding_specs)
3062
+ rows, _ = zip(*self.embedding_specs)
2661
3063
 
2662
3064
  rows_cumsum = [0] + list(itertools.accumulate(rows))
2663
3065
  splits = []
@@ -2738,15 +3140,48 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2738
3140
  self.flush(force=should_flush)
2739
3141
  return snapshot_handle, checkpoint_handle
2740
3142
 
3143
+ def get_embedding_dim_for_kvt(
3144
+ self, metaheader_dim: int, emb_dim: int, is_loading_checkpoint: bool
3145
+ ) -> int:
3146
+ if self.load_ckpt_without_opt:
3147
+ # For silvertorch publish, we don't want to load opt into backend due to limited cpu memory in publish host.
3148
+ # So we need to load the whole row into state dict which loading the checkpoint in st publish, then only save weight into backend, after that
3149
+ # backend will only have metaheader + weight.
3150
+ # For the first loading, we need to set dim with metaheader_dim + emb_dim + optimizer_state_dim, otherwise the checkpoint loadding will throw size mismatch error
3151
+ # after the first loading, we only need to get metaheader+weight from backend for state dict, so we can set dim with metaheader_dim + emb
3152
+ if is_loading_checkpoint:
3153
+ return (
3154
+ (
3155
+ metaheader_dim # metaheader is already padded
3156
+ + pad4(emb_dim)
3157
+ + pad4(self.optimizer_state_dim)
3158
+ )
3159
+ if self.backend_return_whole_row
3160
+ else emb_dim
3161
+ )
3162
+ else:
3163
+ return metaheader_dim + pad4(emb_dim)
3164
+ else:
3165
+ return (
3166
+ (
3167
+ metaheader_dim # metaheader is already padded
3168
+ + pad4(emb_dim)
3169
+ + pad4(self.optimizer_state_dim)
3170
+ )
3171
+ if self.backend_return_whole_row
3172
+ else emb_dim
3173
+ )
3174
+
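A small worked sketch of the widths get_embedding_dim_for_kvt can produce, using a hypothetical table with emb_dim=127, a metaheader of 8 elements, and optimizer_state_dim=1; pad4 is assumed to round up to a multiple of 4:

def pad4(v: int) -> int:
    # assumed behavior of the pad4 helper: round up to a multiple of 4
    return (v + 3) // 4 * 4

metaheader_dim, emb_dim, optimizer_state_dim = 8, 127, 1

whole_row = metaheader_dim + pad4(emb_dim) + pad4(optimizer_state_dim)  # 140, backend_return_whole_row
weight_only = metaheader_dim + pad4(emb_dim)                            # 136, load_ckpt_without_opt after the first load
plain = emb_dim                                                         # 127, backend does not return whole rows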
2741
3175
  @torch.jit.export
2742
3176
  def split_embedding_weights(
2743
3177
  self,
2744
3178
  no_snapshot: bool = True,
2745
3179
  should_flush: bool = False,
2746
- ) -> Tuple[ # TODO: make this a NamedTuple for readability
2747
- Union[List[PartiallyMaterializedTensor], List[torch.Tensor]],
2748
- Optional[List[torch.Tensor]],
2749
- Optional[List[torch.Tensor]],
3180
+ ) -> tuple[ # TODO: make this a NamedTuple for readability
3181
+ Union[list[PartiallyMaterializedTensor], list[torch.Tensor]],
3182
+ Optional[list[torch.Tensor]],
3183
+ Optional[list[torch.Tensor]],
3184
+ Optional[list[torch.Tensor]],
2750
3185
  ]:
2751
3186
  """
2752
3187
  This method is intended to be used by the checkpointing engine
@@ -2766,6 +3201,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2766
3201
  2nd arg: input id sorted in bucket id ascending order
2767
3202
  3rd arg: active id count per bucket id, tensor size is [bucket_id_end - bucket_id_start]
2768
3203
  where for the i th element, we have i + bucket_id_start = global bucket id
3204
+ 4th arg: kvzch eviction metadata for each input id sorted in bucket id ascending order
2769
3205
  """
2770
3206
  snapshot_handle, checkpoint_handle = self._may_create_snapshot_for_state_dict(
2771
3207
  no_snapshot=no_snapshot,
@@ -2782,16 +3218,21 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2782
3218
  self._cached_kvzch_data.cached_weight_tensor_per_table,
2783
3219
  self._cached_kvzch_data.cached_id_tensor_per_table,
2784
3220
  self._cached_kvzch_data.cached_bucket_splits,
3221
+ [], # metadata tensor is not needed for checkpoint loading
2785
3222
  )
2786
3223
  start_time = time.time()
2787
3224
  pmt_splits = []
2788
3225
  bucket_sorted_id_splits = [] if self.kv_zch_params else None
2789
3226
  active_id_cnt_per_bucket_split = [] if self.kv_zch_params else None
3227
+ metadata_splits = [] if self.kv_zch_params else None
3228
+ skip_metadata = False
2790
3229
 
2791
3230
  table_offset = 0
2792
3231
  for i, (emb_height, emb_dim) in enumerate(self.embedding_specs):
3232
+ is_loading_checkpoint = False
2793
3233
  bucket_ascending_id_tensor = None
2794
3234
  bucket_t = None
3235
+ metadata_tensor = None
2795
3236
  row_offset = table_offset
2796
3237
  metaheader_dim = 0
2797
3238
  if self.kv_zch_params:
@@ -2823,6 +3264,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2823
3264
  bucket_size,
2824
3265
  )
2825
3266
  )
3267
+ metadata_tensor = self._ssd_db.get_kv_zch_eviction_metadata_by_snapshot(
3268
+ bucket_ascending_id_tensor + table_offset,
3269
+ torch.as_tensor(bucket_ascending_id_tensor.size(0)),
3270
+ snapshot_handle,
3271
+ ).view(-1, 1)
3272
+
2826
3273
  # 3. convert local id back to global id
2827
3274
  bucket_ascending_id_tensor.add_(bucket_id_start * bucket_size)
2828
3275
 
@@ -2833,16 +3280,32 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2833
3280
  logging.info(
2834
3281
  f"before weight PMT loading, resetting id tensor with {self.local_weight_counts[i]}"
2835
3282
  )
2836
- bucket_ascending_id_tensor = torch.zeros(
2837
- (self.local_weight_counts[i], 1),
2838
- device=torch.device("cpu"),
2839
- dtype=torch.int64,
2840
- )
3283
+ if self.global_id_per_rank[i].numel() != 0:
3284
+ assert (
3285
+ self.local_weight_counts[i]
3286
+ == self.global_id_per_rank[i].numel()
3287
+ ), f"local weight count and global id per rank size mismatch, with {self.local_weight_counts[i]} and {self.global_id_per_rank[i].numel()}"
3288
+ bucket_ascending_id_tensor = self.global_id_per_rank[i].to(
3289
+ device=torch.device("cpu"), dtype=torch.int64
3290
+ )
3291
+ else:
3292
+ bucket_ascending_id_tensor = torch.zeros(
3293
+ (self.local_weight_counts[i], 1),
3294
+ device=torch.device("cpu"),
3295
+ dtype=torch.int64,
3296
+ )
3297
+ skip_metadata = True
3298
+ is_loading_checkpoint = True
3299
+
2841
3300
  # self.local_weight_counts[i] = 0 # Reset the count
2842
3301
 
2843
3302
  # pyre-ignore [16] bucket_sorted_id_splits is not None
2844
3303
  bucket_sorted_id_splits.append(bucket_ascending_id_tensor)
2845
3304
  active_id_cnt_per_bucket_split.append(bucket_t)
3305
+ if skip_metadata:
3306
+ metadata_splits = None
3307
+ else:
3308
+ metadata_splits.append(metadata_tensor)
2846
3309
 
2847
3310
  # for KV ZCH tbe, the sorted_indices is global id for checkpointing and publishing
2848
3311
  # but in backend, local id is used during training, so the KVTensorWrapper need to convert global id to local id
@@ -2857,14 +3320,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2857
3320
  if bucket_ascending_id_tensor is not None
2858
3321
  else emb_height
2859
3322
  ),
2860
- (
2861
- (
2862
- metaheader_dim # metaheader is already padded
2863
- + pad4(emb_dim)
2864
- + pad4(self.optimizer_state_dim)
2865
- )
2866
- if self.backend_return_whole_row
2867
- else emb_dim
3323
+ self.get_embedding_dim_for_kvt(
3324
+ metaheader_dim, emb_dim, is_loading_checkpoint
2868
3325
  ),
2869
3326
  ],
2870
3327
  dtype=dtype,
@@ -2876,6 +3333,11 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2876
3333
  bucket_ascending_id_tensor if self.kv_zch_params else None
2877
3334
  ),
2878
3335
  checkpoint_handle=checkpoint_handle,
3336
+ only_load_weight=(
3337
+ True
3338
+ if self.load_ckpt_without_opt and is_loading_checkpoint
3339
+ else False
3340
+ ),
2879
3341
  )
2880
3342
  (
2881
3343
  tensor_wrapper.set_embedding_rocks_dp_wrapper(self.ssd_db)
@@ -2898,14 +3360,19 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2898
3360
  f"num ids list: {[ids.numel() for ids in bucket_sorted_id_splits]}"
2899
3361
  )
2900
3362
 
2901
- return (pmt_splits, bucket_sorted_id_splits, active_id_cnt_per_bucket_split)
3363
+ return (
3364
+ pmt_splits,
3365
+ bucket_sorted_id_splits,
3366
+ active_id_cnt_per_bucket_split,
3367
+ metadata_splits,
3368
+ )
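With the change above, split_embedding_weights returns a 4-tuple; a hedged sketch of the caller-side unpacking, where tbe stands for an already-constructed SSDTableBatchedEmbeddingBags instance:

# pmts:       per-table weights (PartiallyMaterializedTensor or plain tensors)
# bucket_ids: input ids sorted in bucket-id ascending order (KV ZCH only, else None)
# id_counts:  active id count per bucket (KV ZCH only, else None)
# metadata:   per-id eviction metadata (KV ZCH only; may be None during checkpoint loading)
pmts, bucket_ids, id_counts, metadata = tbe.split_embedding_weights(
    no_snapshot=False,
    should_flush=True,
)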
2902
3369
 
2903
3370
  @torch.jit.ignore
2904
3371
  def _apply_state_dict_w_offloading(self) -> None:
2905
3372
  # Row count per table
2906
- (rows, _) = zip(*self.embedding_specs)
3373
+ rows, _ = zip(*self.embedding_specs)
2907
3374
  # Cumulative row counts per table for rowwise states
2908
- row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows))
3375
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
2909
3376
 
2910
3377
  for t, _ in enumerate(self.embedding_specs):
2911
3378
  # pyre-ignore [16]
@@ -2932,9 +3399,9 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2932
3399
  @torch.jit.ignore
2933
3400
  def _apply_state_dict_no_offloading(self) -> None:
2934
3401
  # Row count per table
2935
- (rows, _) = zip(*self.embedding_specs)
3402
+ rows, _ = zip(*self.embedding_specs)
2936
3403
  # Cumulative row counts per table for rowwise states
2937
- row_count_cumsum: List[int] = [0] + list(itertools.accumulate(rows))
3404
+ row_count_cumsum: list[int] = [0] + list(itertools.accumulate(rows))
2938
3405
 
2939
3406
  def copy_optimizer_state_(dst: Tensor, src: Tensor, indices: Tensor) -> None:
2940
3407
  device = dst.device
@@ -2968,7 +3435,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
2968
3435
  # Set up the plan for copying optimizer states over
2969
3436
  if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD:
2970
3437
  mapping = [(opt_states[0], self.momentum1_dev)]
2971
- elif self.optimizer == OptimType.PARTIAL_ROWWISE_ADAM:
3438
+ elif self.optimizer in [OptimType.PARTIAL_ROWWISE_ADAM, OptimType.ADAM]:
2972
3439
  mapping = [
2973
3440
  (opt_states[0], self.momentum1_dev),
2974
3441
  (opt_states[1], self.momentum2_dev),
@@ -3025,7 +3492,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3025
3492
  def streaming_write_weight_and_id_per_table(
3026
3493
  self,
3027
3494
  weight_state: torch.Tensor,
3028
- opt_states: List[torch.Tensor],
3495
+ opt_states: list[torch.Tensor],
3029
3496
  id_tensor: torch.Tensor,
3030
3497
  row_offset: int,
3031
3498
  ) -> None:
@@ -3082,7 +3549,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3082
3549
  state_name = self.optimizer.state_names()[o]
3083
3550
 
3084
3551
  # Fetch the byte offsets for the optimizer state by its name
3085
- (start, end) = optimizer_state_byte_offsets[state_name]
3552
+ start, end = optimizer_state_byte_offsets[state_name]
3086
3553
 
3087
3554
  # Assume that the opt_state passed in already has dtype matching
3088
3555
  # self.optimizer_state_dtypes[state_name]
@@ -3119,7 +3586,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3119
3586
  self.load_state_dict = True
3120
3587
 
3121
3588
  dtype = self.weights_precision.as_dtype()
3122
- (_, dims) = zip(*self.embedding_specs)
3589
+ _, dims = zip(*self.embedding_specs)
3123
3590
 
3124
3591
  self._cached_kvzch_data = KVZCHCachedData([], [], [], [])
3125
3592
 
@@ -3192,6 +3659,10 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3192
3659
  def flush(self, force: bool = False) -> None:
3193
3660
  # allow force flush from split_embedding_weights to cover edge cases, e.g. checkpointing
3194
3661
  # after trained 0 batches
3662
+ if not self.training:
3663
+ # in eval mode, we should not write anything to the embedding backend
3664
+ return
3665
+
3195
3666
  if self.step == self.last_flush_step and not force:
3196
3667
  logging.info(
3197
3668
  f"SSD TBE has been flushed at {self.last_flush_step=} already for tbe:{self.tbe_unique_id}"
@@ -3237,18 +3708,20 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3237
3708
  indices: Tensor,
3238
3709
  offsets: Tensor,
3239
3710
  per_sample_weights: Optional[Tensor] = None,
3240
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
3241
- ) -> Tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
3711
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
3712
+ vbe_output: Optional[Tensor] = None,
3713
+ vbe_output_offsets: Optional[Tensor] = None,
3714
+ ) -> tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
3242
3715
  """
3243
3716
  Prepare TBE inputs
3244
3717
  """
3245
3718
  # Generate VBE metadata
3246
3719
  vbe_metadata = self._generate_vbe_metadata(
3247
- offsets, batch_size_per_feature_per_rank
3720
+ offsets, batch_size_per_feature_per_rank, vbe_output, vbe_output_offsets
3248
3721
  )
3249
3722
 
3250
3723
  # Force casting indices and offsets to long
3251
- (indices, offsets) = indices.long(), offsets.long()
3724
+ indices, offsets = indices.long(), offsets.long()
3252
3725
 
3253
3726
  # Force casting per_sample_weights to float
3254
3727
  if per_sample_weights is not None:
@@ -3287,6 +3760,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3287
3760
  self._report_l2_cache_perf_stats()
3288
3761
  if self.backend_type == BackendType.DRAM:
3289
3762
  self._report_dram_kv_perf_stats()
3763
+ if self.kv_zch_params and self.kv_zch_params.eviction_policy:
3764
+ self._report_eviction_stats()
3290
3765
 
3291
3766
  @torch.jit.ignore
3292
3767
  def _report_ssd_l1_cache_stats(self) -> None:
@@ -3303,7 +3778,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3303
3778
  ssd_cache_stats = self.ssd_cache_stats.tolist()
3304
3779
  if len(self.last_reported_ssd_stats) == 0:
3305
3780
  self.last_reported_ssd_stats = [0.0] * len(ssd_cache_stats)
3306
- ssd_cache_stats_delta: List[float] = [0.0] * len(ssd_cache_stats)
3781
+ ssd_cache_stats_delta: list[float] = [0.0] * len(ssd_cache_stats)
3307
3782
  for i in range(len(ssd_cache_stats)):
3308
3783
  ssd_cache_stats_delta[i] = (
3309
3784
  ssd_cache_stats[i] - self.last_reported_ssd_stats[i]
@@ -3553,6 +4028,98 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3553
4028
  time_unit="us",
3554
4029
  )
3555
4030
 
4031
+ @torch.jit.ignore
4032
+ def _report_eviction_stats(self) -> None:
4033
+ if self.stats_reporter is None:
4034
+ return
4035
+
4036
+ stats_reporter: TBEStatsReporter = self.stats_reporter
4037
+ if not stats_reporter.should_report(self.step):
4038
+ return
4039
+
4040
+ # skip metrics reporting when evicting disabled
4041
+ if self.kv_zch_params.eviction_policy.eviction_trigger_mode == 0:
4042
+ return
4043
+
4044
+ T = len(set(self.feature_table_map))
4045
+ evicted_counts = torch.zeros(T, dtype=torch.int64)
4046
+ processed_counts = torch.zeros(T, dtype=torch.int64)
4047
+ eviction_threshold_with_dry_run = torch.zeros(T, dtype=torch.float)
4048
+ full_duration_ms = torch.tensor(0, dtype=torch.int64)
4049
+ exec_duration_ms = torch.tensor(0, dtype=torch.int64)
4050
+ self.ssd_db.get_feature_evict_metric(
4051
+ evicted_counts,
4052
+ processed_counts,
4053
+ eviction_threshold_with_dry_run,
4054
+ full_duration_ms,
4055
+ exec_duration_ms,
4056
+ )
4057
+
4058
+ stats_reporter.report_data_amount(
4059
+ iteration_step=self.step,
4060
+ event_name=self.eviction_sum_evicted_counts_stats_name,
4061
+ data_bytes=int(evicted_counts.sum().item()),
4062
+ enable_tb_metrics=True,
4063
+ )
4064
+ stats_reporter.report_data_amount(
4065
+ iteration_step=self.step,
4066
+ event_name=self.eviction_sum_processed_counts_stats_name,
4067
+ data_bytes=int(processed_counts.sum().item()),
4068
+ enable_tb_metrics=True,
4069
+ )
4070
+ if processed_counts.sum().item() != 0:
4071
+ stats_reporter.report_data_amount(
4072
+ iteration_step=self.step,
4073
+ event_name=self.eviction_evict_rate_stats_name,
4074
+ data_bytes=int(
4075
+ evicted_counts.sum().item() * 100 / processed_counts.sum().item()
4076
+ ),
4077
+ enable_tb_metrics=True,
4078
+ )
4079
+ for t in self.feature_table_map:
4080
+ stats_reporter.report_data_amount(
4081
+ iteration_step=self.step,
4082
+ event_name=f"eviction.feature_table.{t}.evicted_counts",
4083
+ data_bytes=int(evicted_counts[t].item()),
4084
+ enable_tb_metrics=True,
4085
+ )
4086
+ stats_reporter.report_data_amount(
4087
+ iteration_step=self.step,
4088
+ event_name=f"eviction.feature_table.{t}.processed_counts",
4089
+ data_bytes=int(processed_counts[t].item()),
4090
+ enable_tb_metrics=True,
4091
+ )
4092
+ if processed_counts[t].item() != 0:
4093
+ stats_reporter.report_data_amount(
4094
+ iteration_step=self.step,
4095
+ event_name=f"eviction.feature_table.{t}.evict_rate",
4096
+ data_bytes=int(
4097
+ evicted_counts[t].item() * 100 / processed_counts[t].item()
4098
+ ),
4099
+ enable_tb_metrics=True,
4100
+ )
4101
+ stats_reporter.report_duration(
4102
+ iteration_step=self.step,
4103
+ event_name="eviction.feature_table.full_duration_ms",
4104
+ duration_ms=full_duration_ms.item(),
4105
+ time_unit="ms",
4106
+ enable_tb_metrics=True,
4107
+ )
4108
+ stats_reporter.report_duration(
4109
+ iteration_step=self.step,
4110
+ event_name="eviction.feature_table.exec_duration_ms",
4111
+ duration_ms=exec_duration_ms.item(),
4112
+ time_unit="ms",
4113
+ enable_tb_metrics=True,
4114
+ )
4115
+ if full_duration_ms.item() != 0:
4116
+ stats_reporter.report_data_amount(
4117
+ iteration_step=self.step,
4118
+ event_name="eviction.feature_table.exec_div_full_duration_rate",
4119
+ data_bytes=int(exec_duration_ms.item() * 100 / full_duration_ms.item()),
4120
+ enable_tb_metrics=True,
4121
+ )
4122
+
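The evict-rate values above are reported as integer percentages; a tiny illustration of that arithmetic with hypothetical counts:

import torch

evicted_counts = torch.tensor([120, 0, 30], dtype=torch.int64)       # hypothetical per-table counts
processed_counts = torch.tensor([1000, 500, 300], dtype=torch.int64)

if processed_counts.sum().item() != 0:  # same zero guard as in the reporter above
    overall_rate = int(evicted_counts.sum().item() * 100 / processed_counts.sum().item())  # 8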
3556
4123
  @torch.jit.ignore
3557
4124
  def _report_dram_kv_perf_stats(self) -> None:
3558
4125
  """
@@ -3570,8 +4137,8 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3570
4137
  self.step, stats_reporter.report_interval # pyre-ignore
3571
4138
  )
3572
4139
 
3573
- if len(dram_kv_perf_stats) != 22:
3574
- logging.error("dram cache perf stats should have 22 elements")
4140
+ if len(dram_kv_perf_stats) != 36:
4141
+ logging.error("dram cache perf stats should have 36 elements")
3575
4142
  return
3576
4143
 
3577
4144
  dram_read_duration = dram_kv_perf_stats[0]
@@ -3599,52 +4166,75 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3599
4166
 
3600
4167
  dram_kv_allocated_bytes = dram_kv_perf_stats[20]
3601
4168
  dram_kv_actual_used_chunk_bytes = dram_kv_perf_stats[21]
4169
+ dram_kv_num_rows = dram_kv_perf_stats[22]
4170
+ dram_kv_read_counts = dram_kv_perf_stats[23]
4171
+ dram_metadata_write_sharding_total_duration = dram_kv_perf_stats[24]
4172
+ dram_metadata_write_total_duration = dram_kv_perf_stats[25]
4173
+ dram_metadata_write_allocate_avg_duration = dram_kv_perf_stats[26]
4174
+ dram_metadata_write_lookup_cache_avg_duration = dram_kv_perf_stats[27]
4175
+ dram_metadata_write_acquire_lock_avg_duration = dram_kv_perf_stats[28]
4176
+ dram_metadata_write_cache_miss_avg_count = dram_kv_perf_stats[29]
4177
+
4178
+ dram_read_metadata_total_duration = dram_kv_perf_stats[30]
4179
+ dram_read_metadata_sharding_total_duration = dram_kv_perf_stats[31]
4180
+ dram_read_metadata_cache_hit_copy_avg_duration = dram_kv_perf_stats[32]
4181
+ dram_read_metadata_lookup_cache_total_avg_duration = dram_kv_perf_stats[33]
4182
+ dram_read_metadata_acquire_lock_avg_duration = dram_kv_perf_stats[34]
4183
+ dram_read_read_metadata_load_size = dram_kv_perf_stats[35]
3602
4184
 
3603
4185
  stats_reporter.report_duration(
3604
4186
  iteration_step=self.step,
3605
4187
  event_name="dram_kv.perf.get.dram_read_duration_us",
3606
4188
  duration_ms=dram_read_duration,
4189
+ enable_tb_metrics=True,
3607
4190
  time_unit="us",
3608
4191
  )
3609
4192
  stats_reporter.report_duration(
3610
4193
  iteration_step=self.step,
3611
4194
  event_name="dram_kv.perf.get.dram_read_sharding_duration_us",
3612
4195
  duration_ms=dram_read_sharding_duration,
4196
+ enable_tb_metrics=True,
3613
4197
  time_unit="us",
3614
4198
  )
3615
4199
  stats_reporter.report_duration(
3616
4200
  iteration_step=self.step,
3617
4201
  event_name="dram_kv.perf.get.dram_read_cache_hit_copy_duration_us",
3618
4202
  duration_ms=dram_read_cache_hit_copy_duration,
4203
+ enable_tb_metrics=True,
3619
4204
  time_unit="us",
3620
4205
  )
3621
4206
  stats_reporter.report_duration(
3622
4207
  iteration_step=self.step,
3623
4208
  event_name="dram_kv.perf.get.dram_read_fill_row_storage_duration_us",
3624
4209
  duration_ms=dram_read_fill_row_storage_duration,
4210
+ enable_tb_metrics=True,
3625
4211
  time_unit="us",
3626
4212
  )
3627
4213
  stats_reporter.report_duration(
3628
4214
  iteration_step=self.step,
3629
4215
  event_name="dram_kv.perf.get.dram_read_lookup_cache_duration_us",
3630
4216
  duration_ms=dram_read_lookup_cache_duration,
4217
+ enable_tb_metrics=True,
3631
4218
  time_unit="us",
3632
4219
  )
3633
4220
  stats_reporter.report_duration(
3634
4221
  iteration_step=self.step,
3635
4222
  event_name="dram_kv.perf.get.dram_read_acquire_lock_duration_us",
3636
4223
  duration_ms=dram_read_acquire_lock_duration,
4224
+ enable_tb_metrics=True,
3637
4225
  time_unit="us",
3638
4226
  )
3639
4227
  stats_reporter.report_data_amount(
3640
4228
  iteration_step=self.step,
3641
4229
  event_name="dram_kv.perf.get.dram_read_missing_load",
4230
+ enable_tb_metrics=True,
3642
4231
  data_bytes=dram_read_missing_load,
3643
4232
  )
3644
4233
  stats_reporter.report_duration(
3645
4234
  iteration_step=self.step,
3646
4235
  event_name="dram_kv.perf.set.dram_write_sharing_duration_us",
3647
4236
  duration_ms=dram_write_sharing_duration,
4237
+ enable_tb_metrics=True,
3648
4238
  time_unit="us",
3649
4239
  )
3650
4240
 
@@ -3652,83 +4242,192 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3652
4242
  iteration_step=self.step,
3653
4243
  event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_duration_us",
3654
4244
  duration_ms=dram_fwd_l1_eviction_write_duration,
4245
+ enable_tb_metrics=True,
3655
4246
  time_unit="us",
3656
4247
  )
3657
4248
  stats_reporter.report_duration(
3658
4249
  iteration_step=self.step,
3659
4250
  event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_allocate_duration_us",
3660
4251
  duration_ms=dram_fwd_l1_eviction_write_allocate_duration,
4252
+ enable_tb_metrics=True,
3661
4253
  time_unit="us",
3662
4254
  )
3663
4255
  stats_reporter.report_duration(
3664
4256
  iteration_step=self.step,
3665
4257
  event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_cache_copy_duration_us",
3666
4258
  duration_ms=dram_fwd_l1_eviction_write_cache_copy_duration,
4259
+ enable_tb_metrics=True,
3667
4260
  time_unit="us",
3668
4261
  )
3669
4262
  stats_reporter.report_duration(
3670
4263
  iteration_step=self.step,
3671
4264
  event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_lookup_cache_duration_us",
3672
4265
  duration_ms=dram_fwd_l1_eviction_write_lookup_cache_duration,
4266
+ enable_tb_metrics=True,
3673
4267
  time_unit="us",
3674
4268
  )
3675
4269
  stats_reporter.report_duration(
3676
4270
  iteration_step=self.step,
3677
4271
  event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_acquire_lock_duration_us",
3678
4272
  duration_ms=dram_fwd_l1_eviction_write_acquire_lock_duration,
4273
+ enable_tb_metrics=True,
3679
4274
  time_unit="us",
3680
4275
  )
3681
4276
  stats_reporter.report_data_amount(
3682
4277
  iteration_step=self.step,
3683
4278
  event_name="dram_kv.perf.set.dram_fwd_l1_eviction_write_missing_load",
3684
4279
  data_bytes=dram_fwd_l1_eviction_write_missing_load,
4280
+ enable_tb_metrics=True,
3685
4281
  )
3686
4282
 
3687
4283
  stats_reporter.report_duration(
3688
4284
  iteration_step=self.step,
3689
4285
  event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_duration_us",
3690
4286
  duration_ms=dram_bwd_l1_cnflct_miss_write_duration,
4287
+ enable_tb_metrics=True,
3691
4288
  time_unit="us",
3692
4289
  )
3693
4290
  stats_reporter.report_duration(
3694
4291
  iteration_step=self.step,
3695
4292
  event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_allocate_duration_us",
3696
4293
  duration_ms=dram_bwd_l1_cnflct_miss_write_allocate_duration,
4294
+ enable_tb_metrics=True,
3697
4295
  time_unit="us",
3698
4296
  )
3699
4297
  stats_reporter.report_duration(
3700
4298
  iteration_step=self.step,
3701
4299
  event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_cache_copy_duration_us",
3702
4300
  duration_ms=dram_bwd_l1_cnflct_miss_write_cache_copy_duration,
4301
+ enable_tb_metrics=True,
3703
4302
  time_unit="us",
3704
4303
  )
3705
4304
  stats_reporter.report_duration(
3706
4305
  iteration_step=self.step,
3707
4306
  event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_lookup_cache_duration_us",
3708
4307
  duration_ms=dram_bwd_l1_cnflct_miss_write_lookup_cache_duration,
4308
+ enable_tb_metrics=True,
3709
4309
  time_unit="us",
3710
4310
  )
3711
4311
  stats_reporter.report_duration(
3712
4312
  iteration_step=self.step,
3713
4313
  event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_acquire_lock_duration_us",
3714
4314
  duration_ms=dram_bwd_l1_cnflct_miss_write_acquire_lock_duration,
4315
+ enable_tb_metrics=True,
3715
4316
  time_unit="us",
3716
4317
  )
3717
4318
  stats_reporter.report_data_amount(
3718
4319
  iteration_step=self.step,
3719
4320
  event_name="dram_kv.perf.set.dram_bwd_l1_cnflct_miss_write_missing_load",
3720
4321
  data_bytes=dram_bwd_l1_cnflct_miss_write_missing_load,
4322
+ enable_tb_metrics=True,
4323
+ )
4324
+
4325
+ stats_reporter.report_data_amount(
4326
+ iteration_step=self.step,
4327
+ event_name="dram_kv.perf.get.dram_kv_read_counts",
4328
+ data_bytes=dram_kv_read_counts,
4329
+ enable_tb_metrics=True,
3721
4330
  )
3722
4331
 
3723
4332
  stats_reporter.report_data_amount(
3724
4333
  iteration_step=self.step,
3725
4334
  event_name=self.dram_kv_allocated_bytes_stats_name,
3726
4335
  data_bytes=dram_kv_allocated_bytes,
4336
+ enable_tb_metrics=True,
3727
4337
  )
3728
4338
  stats_reporter.report_data_amount(
3729
4339
  iteration_step=self.step,
3730
4340
  event_name=self.dram_kv_actual_used_chunk_bytes_stats_name,
3731
4341
  data_bytes=dram_kv_actual_used_chunk_bytes,
4342
+ enable_tb_metrics=True,
4343
+ )
4344
+ stats_reporter.report_data_amount(
4345
+ iteration_step=self.step,
4346
+ event_name=self.dram_kv_mem_num_rows_stats_name,
4347
+ data_bytes=dram_kv_num_rows,
4348
+ enable_tb_metrics=True,
4349
+ )
4350
+ stats_reporter.report_duration(
4351
+ iteration_step=self.step,
4352
+ event_name="dram_kv.perf.set.dram_eviction_score_write_sharding_total_duration_us",
4353
+ duration_ms=dram_metadata_write_sharding_total_duration,
4354
+ enable_tb_metrics=True,
4355
+ time_unit="us",
4356
+ )
4357
+ stats_reporter.report_duration(
4358
+ iteration_step=self.step,
4359
+ event_name="dram_kv.perf.set.dram_eviction_score_write_total_duration_us",
4360
+ duration_ms=dram_metadata_write_total_duration,
4361
+ enable_tb_metrics=True,
4362
+ time_unit="us",
4363
+ )
4364
+ stats_reporter.report_duration(
4365
+ iteration_step=self.step,
4366
+ event_name="dram_kv.perf.set.dram_eviction_score_write_allocate_avg_duration_us",
4367
+ duration_ms=dram_metadata_write_allocate_avg_duration,
4368
+ enable_tb_metrics=True,
4369
+ time_unit="us",
4370
+ )
4371
+ stats_reporter.report_duration(
4372
+ iteration_step=self.step,
4373
+ event_name="dram_kv.perf.set.dram_eviction_score_write_lookup_cache_avg_duration_us",
4374
+ duration_ms=dram_metadata_write_lookup_cache_avg_duration,
4375
+ enable_tb_metrics=True,
4376
+ time_unit="us",
4377
+ )
4378
+ stats_reporter.report_duration(
4379
+ iteration_step=self.step,
4380
+ event_name="dram_kv.perf.set.dram_eviction_score_write_acquire_lock_avg_duration_us",
4381
+ duration_ms=dram_metadata_write_acquire_lock_avg_duration,
4382
+ enable_tb_metrics=True,
4383
+ time_unit="us",
4384
+ )
4385
+ stats_reporter.report_data_amount(
4386
+ iteration_step=self.step,
4387
+ event_name="dram_kv.perf.set.dram_eviction_score_write_cache_miss_avg_count",
4388
+ data_bytes=dram_metadata_write_cache_miss_avg_count,
4389
+ enable_tb_metrics=True,
4390
+ )
4391
+ stats_reporter.report_duration(
4392
+ iteration_step=self.step,
4393
+ event_name="dram_kv.perf.get.dram_eviction_score_read_total_duration_us",
4394
+ duration_ms=dram_read_metadata_total_duration,
4395
+ enable_tb_metrics=True,
4396
+ time_unit="us",
4397
+ )
4398
+ stats_reporter.report_duration(
4399
+ iteration_step=self.step,
4400
+ event_name="dram_kv.perf.get.dram_eviction_score_read_sharding_total_duration_us",
4401
+ duration_ms=dram_read_metadata_sharding_total_duration,
4402
+ enable_tb_metrics=True,
4403
+ time_unit="us",
4404
+ )
4405
+ stats_reporter.report_duration(
4406
+ iteration_step=self.step,
4407
+ event_name="dram_kv.perf.get.dram_eviction_score_read_cache_hit_copy_avg_duration_us",
4408
+ duration_ms=dram_read_metadata_cache_hit_copy_avg_duration,
4409
+ enable_tb_metrics=True,
4410
+ time_unit="us",
4411
+ )
4412
+ stats_reporter.report_duration(
4413
+ iteration_step=self.step,
4414
+ event_name="dram_kv.perf.get.dram_eviction_score_read_lookup_cache_total_avg_duration_us",
4415
+ duration_ms=dram_read_metadata_lookup_cache_total_avg_duration,
4416
+ enable_tb_metrics=True,
4417
+ time_unit="us",
4418
+ )
4419
+ stats_reporter.report_duration(
4420
+ iteration_step=self.step,
4421
+ event_name="dram_kv.perf.get.dram_eviction_score_read_acquire_lock_avg_duration_us",
4422
+ duration_ms=dram_read_metadata_acquire_lock_avg_duration,
4423
+ enable_tb_metrics=True,
4424
+ time_unit="us",
4425
+ )
4426
+ stats_reporter.report_data_amount(
4427
+ iteration_step=self.step,
4428
+ event_name="dram_kv.perf.get.dram_eviction_score_read_load_size",
4429
+ data_bytes=dram_read_read_metadata_load_size,
4430
+ enable_tb_metrics=True,
3732
4431
  )
3733
4432
 
3734
4433
  def _recording_to_timer(
@@ -3749,7 +4448,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3749
4448
 
3750
4449
  def fetch_from_l1_sp_w_row_ids(
3751
4450
  self, row_ids: torch.Tensor, only_get_optimizer_states: bool = False
3752
- ) -> Tuple[torch.Tensor, torch.Tensor]:
4451
+ ) -> tuple[list[torch.Tensor], torch.Tensor]:
3753
4452
  """
3754
4453
  Fetch the optimizer states and/or weights from L1 and SP for given linearized row_ids.
3755
4454
  @return: updated_weights/optimizer_states, mask of which rows are filled
@@ -3762,36 +4461,38 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3762
4461
  # NOTE: Remove this once there is support for fetching multiple
3763
4462
  # optimizer states in fetch_from_l1_sp_w_row_ids
3764
4463
  if only_get_optimizer_states and self.optimizer not in [
3765
- OptimType.EXACT_ROWWISE_ADAGRAD
4464
+ OptimType.EXACT_ROWWISE_ADAGRAD,
4465
+ OptimType.PARTIAL_ROWWISE_ADAM,
3766
4466
  ]:
3767
4467
  raise RuntimeError(
3768
4468
  f"Fetching optimizer states using fetch_from_l1_sp_w_row_ids() is not yet supported for {self.optimizer}"
3769
4469
  )
3770
4470
 
3771
- with torch.no_grad():
3772
- weights_dtype = self.weights_precision.as_dtype()
3773
- step = self.step
3774
-
3775
- if only_get_optimizer_states:
3776
- start_pos = pad4(self.max_D)
3777
- # NOTE: This is a hack to keep fetch_from_l1_sp_w_row_ids working
3778
- # until it is upgraded to support optimizers with multiple states
3779
- # and dtypes
3780
- row_dim = int(
3781
- math.ceil(torch.float32.itemsize / weights_dtype.itemsize)
4471
+ def split_results_by_opt_states(
4472
+ updated_weights: torch.Tensor, cache_location_mask: torch.Tensor
4473
+ ) -> tuple[list[torch.Tensor], torch.Tensor]:
4474
+ if not only_get_optimizer_states:
4475
+ return [updated_weights], cache_location_mask
4476
+ # TODO: support mixed dimension case
4477
+ # currently only supports tables with the same max_D dimension
4478
+ opt_to_dim = self.optimizer.byte_offsets_along_row(
4479
+ self.max_D, self.weights_precision, self.optimizer_state_dtypes
4480
+ )
4481
+ updated_opt_states = []
4482
+ for opt_name, dim in opt_to_dim.items():
4483
+ opt_dtype = self.optimizer._extract_dtype(
4484
+ self.optimizer_state_dtypes, opt_name
3782
4485
  )
3783
- result_dtype = torch.float32
3784
- result_dim = int(
3785
- ceil(row_dim / (result_dtype.itemsize / weights_dtype.itemsize))
4486
+ updated_opt_states.append(
4487
+ updated_weights.view(dtype=torch.uint8)[:, dim[0] : dim[1]].view(
4488
+ dtype=opt_dtype
4489
+ )
3786
4490
  )
4491
+ return updated_opt_states, cache_location_mask
3787
4492
 
3788
- else:
3789
- start_pos = 0
3790
- # get the whole row
3791
- row_dim = self.cache_row_dim
3792
- result_dim = row_dim
3793
- result_dtype = weights_dtype
3794
-
4493
+ with torch.no_grad():
4494
+ weights_dtype = self.weights_precision.as_dtype()
4495
+ step = self.step
3795
4496
  with record_function(f"## fetch_from_l1_{step}_{self.tbe_unique_id} ##"):
3796
4497
  lxu_cache_locations: torch.Tensor = torch.ops.fbgemm.lxu_cache_lookup(
3797
4498
  row_ids,
@@ -3800,17 +4501,23 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3800
4501
  )
3801
4502
  updated_weights = torch.empty(
3802
4503
  row_ids.numel(),
3803
- result_dim,
4504
+ self.cache_row_dim,
3804
4505
  device=self.current_device,
3805
- dtype=result_dtype,
4506
+ dtype=weights_dtype,
3806
4507
  )
3807
4508
 
3808
4509
  # D2D copy cache
3809
4510
  cache_location_mask = lxu_cache_locations >= 0
3810
- updated_weights[cache_location_mask] = self.lxu_cache_weights[
3811
- lxu_cache_locations[cache_location_mask],
3812
- start_pos : start_pos + row_dim,
3813
- ].view(result_dtype)
4511
+ torch.ops.fbgemm.masked_index_select(
4512
+ updated_weights,
4513
+ lxu_cache_locations,
4514
+ self.lxu_cache_weights,
4515
+ torch.tensor(
4516
+ [row_ids.numel()],
4517
+ device=self.current_device,
4518
+ dtype=torch.int32,
4519
+ ),
4520
+ )
3814
4521
 
3815
4522
  with record_function(f"## fetch_from_sp_{step}_{self.tbe_unique_id} ##"):
3816
4523
  if len(self.ssd_scratch_pad_eviction_data) > 0:
@@ -3821,7 +4528,9 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3821
4528
  actions_count_gpu = self.ssd_scratch_pad_eviction_data[0][2][0]
3822
4529
  if actions_count_gpu.item() == 0:
3823
4530
  # no action to take
3824
- return (updated_weights, cache_location_mask)
4531
+ return split_results_by_opt_states(
4532
+ updated_weights, cache_location_mask
4533
+ )
3825
4534
 
3826
4535
  sp_idx = sp_idx[:actions_count_gpu]
3827
4536
 
@@ -3872,16 +4581,23 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3872
4581
  )
3873
4582
 
3874
4583
  # D2D copy SP
3875
- updated_weights[exact_match_mask] = sp[
3876
- sp_locations_found, start_pos : start_pos + row_dim
3877
- ].view(result_dtype)
4584
+ torch.ops.fbgemm.masked_index_select(
4585
+ updated_weights,
4586
+ sp_locations_in_updated_weights,
4587
+ sp,
4588
+ torch.tensor(
4589
+ [row_ids.numel()],
4590
+ device=self.current_device,
4591
+ dtype=torch.int32,
4592
+ ),
4593
+ )
3878
4594
  # cache_location_mask is the mask of rows in L1
3879
4595
  # exact_match_mask is the mask of rows in SP
3880
4596
  cache_location_mask = torch.logical_or(
3881
4597
  cache_location_mask, exact_match_mask
3882
4598
  )
3883
4599
 
3884
- return (updated_weights, cache_location_mask)
4600
+ return split_results_by_opt_states(updated_weights, cache_location_mask)
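split_results_by_opt_states slices the packed row buffer by byte range and reinterprets each slice in the state's own dtype; a standalone sketch of that uint8-view trick, assuming FP16 rows of 4 elements that carry one FP32 state at bytes [4, 8):

import torch

rows = torch.zeros(3, 4, dtype=torch.float16)   # hypothetical packed rows, 8 bytes each
byte_start, byte_end = 4, 8                      # hypothetical byte range of one optimizer state

state = rows.view(dtype=torch.uint8)[:, byte_start:byte_end].view(dtype=torch.float32)
print(state.shape)  # torch.Size([3, 1])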
3885
4601
 
3886
4602
  def register_backward_hook_before_eviction(
3887
4603
  self, backward_hook: Callable[[torch.Tensor], None]
@@ -3901,3 +4617,312 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
3901
4617
  self.placeholder_autograd_tensor.register_hook(backward_hook)
3902
4618
  for hook in hooks:
3903
4619
  self.placeholder_autograd_tensor.register_hook(hook)
4620
+
4621
+ def set_local_weight_counts_for_table(
4622
+ self, table_idx: int, weight_count: int
4623
+ ) -> None:
4624
+ self.local_weight_counts[table_idx] = weight_count
4625
+
4626
+ def set_global_id_per_rank_for_table(
4627
+ self, table_idx: int, global_id: torch.Tensor
4628
+ ) -> None:
4629
+ self.global_id_per_rank[table_idx] = global_id
4630
+
4631
+ def direct_write_embedding(
4632
+ self,
4633
+ indices: torch.Tensor,
4634
+ offsets: torch.Tensor,
4635
+ weights: torch.Tensor,
4636
+ ) -> None:
4637
+ """
4638
+ Directly write the weights to L1, SP and backend without relying on auto-gradient for embedding cache.
4639
+ Please refer to design doc for more details: https://docs.google.com/document/d/1TJHKvO1m3-5tYAKZGhacXnGk7iCNAzz7wQlrFbX_LDI/edit?tab=t.0
4640
+ """
4641
+ assert (
4642
+ self._embedding_cache_mode
4643
+ ), "Must be in embedding_cache_mode to support direct_write_embedding method."
4644
+
4645
+ B_offsets = None
4646
+ max_B = -1
4647
+
4648
+ with torch.no_grad():
4649
+ # Wait for any ongoing prefetch operations to complete before starting direct_write
4650
+ current_stream = torch.cuda.current_stream()
4651
+ current_stream.wait_event(self.prefetch_complete_event)
4652
+
4653
+ # Create local step events for internal sequential execution
4654
+ weights_dtype = self.weights_precision.as_dtype()
4655
+ assert (
4656
+ weights_dtype == weights.dtype
4657
+ ), f"Expected embedding table dtype {weights_dtype} is same with input weight dtype, but got {weights.dtype}"
4658
+
4659
+ # Pad the weights to match self.max_D width if necessary
4660
+ if weights.size(1) < self.cache_row_dim:
4661
+ weights = torch.nn.functional.pad(
4662
+ weights, (0, self.cache_row_dim - weights.size(1))
4663
+ )
4664
+
4665
+ step = self.step
4666
+
4667
+ # step 0: run backward hook for prefetch if prefetch pipeline is enabled before writing to L1 and SP
4668
+ if self.prefetch_pipeline:
4669
+ self._update_cache_counter_and_pointers(nn.Module(), torch.empty(0))
4670
+
4671
+ # step 1: lookup and write to l1 cache
4672
+ with record_function(
4673
+ f"## direct_write_to_l1_{step}_{self.tbe_unique_id} ##"
4674
+ ):
4675
+ if self.gather_ssd_cache_stats:
4676
+ self.local_ssd_cache_stats.zero_()
4677
+
4678
+ # Linearize indices
4679
+ linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
4680
+ self.hash_size_cumsum,
4681
+ indices,
4682
+ offsets,
4683
+ B_offsets,
4684
+ max_B,
4685
+ )
4686
+
4687
+ lxu_cache_locations: torch.Tensor = torch.ops.fbgemm.lxu_cache_lookup(
4688
+ linear_cache_indices,
4689
+ self.lxu_cache_state,
4690
+ self.total_hash_size,
4691
+ )
4692
+ cache_location_mask = lxu_cache_locations >= 0
4693
+
4694
+ # Get the cache locations for the row_ids that are already in the cache
4695
+ cache_locations = lxu_cache_locations[cache_location_mask]
4696
+
4697
+ # Get the corresponding input weights for these row_ids
4698
+ cache_weights = weights[cache_location_mask]
4699
+
4700
+ # Update the cache with these input weights
4701
+ if cache_locations.numel() > 0:
4702
+ self.lxu_cache_weights.index_put_(
4703
+ (cache_locations,), cache_weights, accumulate=False
4704
+ )
4705
+
4706
+ # Record completion of step 1
4707
+ current_stream.record_event(self.direct_write_l1_complete_event)
4708
+
4709
+ # step 2: pop the current scratch pad and write to next batch scratch pad if exists
4710
+ # Wait for step 1 to complete
4711
+ with record_function(
4712
+ f"## direct_write_to_sp_{step}_{self.tbe_unique_id} ##"
4713
+ ):
4714
+ if len(self.ssd_scratch_pad_eviction_data) > 0:
4715
+ self.ssd_scratch_pad_eviction_data.pop(0)
4716
+ if len(self.ssd_scratch_pad_eviction_data) > 0:
4717
+ # Wait for any pending backend reads to the next scratch pad
4718
+ # to complete before we write to it. Otherwise, stale backend data
4719
+ # will overwrite our direct_write updates.
4720
+ # The ssd_event_get marks completion of backend fetch operations.
4721
+ current_stream.wait_event(self.ssd_event_get)
4722
+
4723
+ # if scratch pad exists, write to next batch scratch pad
4724
+ sp = self.ssd_scratch_pad_eviction_data[0][0]
4725
+ sp_idx = self.ssd_scratch_pad_eviction_data[0][1].to(
4726
+ self.current_device
4727
+ )
4728
+ actions_count_gpu = self.ssd_scratch_pad_eviction_data[0][2][0]
4729
+ if actions_count_gpu.item() != 0:
4730
+ # when no actional_count_gpu, no need to write to SP
4731
+ sp_idx = sp_idx[:actions_count_gpu]
4732
+
4733
+ # -1 in lxu_cache_locations means the row is not in L1 cache and in SP
4734
+ # fill the row_ids in L1 with -2, >0 values means in SP or backend
4735
+ # @eg. updated_indices_in_sp= [1, 100, 1, 2, -2, 3, 4, 5, 10]
4736
+ updated_indices_in_sp = linear_cache_indices.masked_fill(
4737
+ lxu_cache_locations != -1, -2
4738
+ )
4739
+ # sort the sp_idx for binary search
4740
+ # should already be sorted
4741
+ # sp_idx_inverse_indices is the indices before sorting which is same to the location in SP.
4742
+ # @eg. sp_idx = [4, 2, 1, 3, 10]
4743
+ # @eg sorted_sp_idx = [ 1, 2, 3, 4, 10] and sp_idx_inverse_indices = [2, 1, 3, 0, 4]
4744
+ sorted_sp_idx, sp_idx_inverse_indices = torch.sort(sp_idx)
4745
+ # search rows id in sp against the SP indexes to find location of the rows in SP
4746
+ # @eg: updated_indices_in_sp = [0, 5, 0, 1, 0, 2, 3, 4, 4]
4747
+ # @eg: 5 is OOB
4748
+ updated_indices_in_sp_idx = torch.searchsorted(
4749
+ sorted_sp_idx, updated_indices_in_sp
4750
+ )
4751
+ # does not found in SP will Out of Bound
4752
+ oob_sp_idx = updated_indices_in_sp_idx >= sp_idx.numel()
4753
+ # make the oob items in bound
4754
+ # @eg updated_indices_in_sp=[0, 0, 0, 1, 0, 2, 3, 4, 4]
4755
+ updated_indices_in_sp_idx[oob_sp_idx] = 0
4756
+
4757
+ # torch.searchsorted is not exact match,
4758
+ # we only take exact matched rows, where the id is found in SP.
4759
+ # @eg 5 in updated_indices_in_sp is not in sp_idx, but has 4 in updated_indices_in_sp
4760
+ # @eg sorted_sp_idx[updated_indices_in_sp]=[ 1, 1, 1, 2, 1, 3, 4, 10, 10]
4761
+ # @eg exact_match_mask=[ True, False, True, True, False, True, True, False, True]
4762
+ exact_match_mask = (
4763
+ sorted_sp_idx[updated_indices_in_sp_idx]
4764
+ == updated_indices_in_sp
4765
+ )
4766
+ # Get the location of the row ids found in SP.
4767
+ # @eg: sp_locations_found=[2, 2, 1, 3, 0, 4]
4768
+ sp_locations_found = sp_idx_inverse_indices[
4769
+ updated_indices_in_sp[exact_match_mask]
4770
+ ]
4771
+ # Get the corresponding weights for the matched indices
4772
+ matched_weights = weights[exact_match_mask]
4773
+
4774
+ # Write the weights to the sparse tensor at the found locations
4775
+ if sp_locations_found.numel() > 0:
4776
+ sp.index_put_(
4777
+ (sp_locations_found,),
4778
+ matched_weights,
4779
+ accumulate=False,
4780
+ )
4781
+ current_stream.record_event(self.direct_write_sp_complete_event)
4782
+
4783
+ # step 3: write l1 cache missing rows to backend
4784
+ # Wait for step 2 to complete
4785
+ with record_function(
4786
+ f"## direct_write_to_backend_{step}_{self.tbe_unique_id} ##"
4787
+ ):
4788
+ # Use the existing ssd_eviction_stream for all backend write operations
4789
+ # This stream is already created with low priority during initialization
4790
+ with torch.cuda.stream(self.ssd_eviction_stream):
4791
+ # Create a mask for indices not in L1 cache
4792
+ non_cache_mask = ~cache_location_mask
4793
+
4794
+ # Calculate the count of valid indices (those not in L1 cache)
4795
+ valid_count = non_cache_mask.sum().to(torch.int64).cpu()
4796
+
4797
+ if valid_count.item() > 0:
4798
+ # Extract only the indices and weights that are not in L1 cache
4799
+ non_cache_indices = linear_cache_indices[non_cache_mask]
4800
+ non_cache_weights = weights[non_cache_mask]
4801
+
4802
+ # Move tensors to CPU for set_cuda
4803
+ cpu_indices = non_cache_indices.cpu()
4804
+ cpu_weights = non_cache_weights.cpu()
4805
+
4806
+ # Write to backend - only sending the non-cache indices and weights
4807
+ self.record_function_via_dummy_profile(
4808
+ f"## ssd_write_{step}_set_cuda_{self.tbe_unique_id} ##",
4809
+ self.ssd_db.set_cuda,
4810
+ cpu_indices,
4811
+ cpu_weights,
4812
+ valid_count,
4813
+ self.timestep,
4814
+ is_bwd=False,
4815
+ )
4816
+
4817
+ # Return control to the main stream without waiting for the backend operation to complete
4818
+
4819
+ def get_free_cpu_memory_gb(self) -> float:
4820
+ def _get_mem_available() -> float:
4821
+ if sys.platform.startswith("linux"):
4822
+ info = {}
4823
+ with open("/proc/meminfo") as f:
4824
+ for line in f:
4825
+ p = line.split()
4826
+ info[p[0].strip(":").lower()] = int(p[1]) * 1024
4827
+ if "memavailable" in info:
4828
+ # Linux >= 3.14
4829
+ return info["memavailable"]
4830
+ else:
4831
+ return info["memfree"] + info["cached"]
4832
+ else:
4833
+ raise RuntimeError(
4834
+ "Unsupported platform for free memory eviction, pls use ID count eviction tirgger mode"
4835
+ )
4836
+
4837
+ mem = _get_mem_available()
4838
+ return mem / (1024**3)
4839
+
4840
+ @classmethod
4841
+ def trigger_evict_in_all_tbes(cls) -> None:
4842
+ for tbe in cls._all_tbe_instances:
4843
+ tbe.ssd_db.trigger_feature_evict()
4844
+
4845
+ @classmethod
4846
+ def tbe_has_ongoing_eviction(cls) -> bool:
4847
+ for tbe in cls._all_tbe_instances:
4848
+ if tbe.ssd_db.is_evicting():
4849
+ return True
4850
+ return False
4851
+
4852
+ def set_free_mem_eviction_trigger_config(
4853
+ self, eviction_policy: EvictionPolicy
4854
+ ) -> None:
4855
+ self.enable_free_mem_trigger_eviction = True
4856
+ self.eviction_trigger_mode: int = eviction_policy.eviction_trigger_mode
4857
+ assert (
4858
+ eviction_policy.eviction_free_mem_check_interval_batch is not None
4859
+ ), "eviction_free_mem_check_interval_batch is unexpected none for free_mem eviction trigger mode"
4860
+ self.eviction_free_mem_check_interval_batch: int = (
4861
+ eviction_policy.eviction_free_mem_check_interval_batch
4862
+ )
4863
+ assert (
4864
+ eviction_policy.eviction_free_mem_threshold_gb is not None
4865
+ ), "eviction_policy.eviction_free_mem_threshold_gb is unexpected none for free_mem eviction trigger mode"
4866
+ self.eviction_free_mem_threshold_gb: int = (
4867
+ eviction_policy.eviction_free_mem_threshold_gb
4868
+ )
4869
+ logging.info(
4870
+ f"[FREE_MEM Eviction] eviction config, trigger model: FREE_MEM, {self.eviction_free_mem_check_interval_batch=}, {self.eviction_free_mem_threshold_gb=}"
4871
+ )
4872
+
4873
+ def may_trigger_eviction(self) -> None:
4874
+ def is_first_tbe() -> bool:
4875
+ first = SSDTableBatchedEmbeddingBags._first_instance_ref
4876
+ return first is not None and first() is self
4877
+
4878
+ # We assume that the eviction time is less than free mem check interval time
4879
+ # So every time we reach this check, all evictions in all tbes should be finished.
4880
+ # We only need to check the first tbe because all tbes share the same free mem,
4881
+ # once the first tbe detect need to trigger eviction, it will call trigger func
4882
+ # in all tbes from _all_tbe_instances
4883
+ if (
4884
+ self.enable_free_mem_trigger_eviction
4885
+ and self.step % self.eviction_free_mem_check_interval_batch == 0
4886
+ and self.training
4887
+ and is_first_tbe()
4888
+ ):
4889
+ if not SSDTableBatchedEmbeddingBags.tbe_has_ongoing_eviction():
4890
+ SSDTableBatchedEmbeddingBags._eviction_triggered = False
4891
+
4892
+ free_cpu_mem_gb = self.get_free_cpu_memory_gb()
4893
+ local_evict_trigger = int(
4894
+ free_cpu_mem_gb < self.eviction_free_mem_threshold_gb
4895
+ )
4896
+ tensor_flag = torch.tensor(
4897
+ local_evict_trigger,
4898
+ device=self.current_device,
4899
+ dtype=torch.int,
4900
+ )
4901
+ world_size = dist.get_world_size(self._pg)
4902
+ if world_size > 1:
4903
+ dist.all_reduce(tensor_flag, op=dist.ReduceOp.SUM, group=self._pg)
4904
+ global_evict_trigger = tensor_flag.item()
4905
+ else:
4906
+ global_evict_trigger = local_evict_trigger
4907
+ if (
4908
+ global_evict_trigger >= 1
4909
+ and SSDTableBatchedEmbeddingBags._eviction_triggered
4910
+ ):
4911
+ logging.warning(
4912
+ f"[FREE_MEM Eviction] {global_evict_trigger} ranks triggered eviction, but SSDTableBatchedEmbeddingBags._eviction_triggered is true"
4913
+ )
4914
+ if (
4915
+ global_evict_trigger >= 1
4916
+ and not SSDTableBatchedEmbeddingBags._eviction_triggered
4917
+ ):
4918
+ SSDTableBatchedEmbeddingBags._eviction_triggered = True
4919
+ SSDTableBatchedEmbeddingBags.trigger_evict_in_all_tbes()
4920
+ logging.info(
4921
+ f"[FREE_MEM Eviction] Evict all at batch {self.step}, {free_cpu_mem_gb} GB free CPU memory, {global_evict_trigger} ranks triggered eviction"
4922
+ )
4923
+
4924
+ def reset_inference_mode(self) -> None:
4925
+ """
4926
+ Reset the inference mode
4927
+ """
4928
+ self.eval()