fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
@@ -18,14 +18,14 @@ import uuid
 from dataclasses import dataclass, field
 from itertools import accumulate
 from math import log2
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, Optional, Union
 
 import torch  # usort:skip
 from torch import nn, Tensor  # usort:skip
+from torch.autograd.profiler import record_function  # usort:skip
 
 # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
 import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
-
 from fbgemm_gpu.config import FeatureGate, FeatureGateName
 from fbgemm_gpu.runtime_monitor import (
     AsyncSeriesTimer,
@@ -48,6 +48,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
     SplitState,
 )
 from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
+    check_allocated_vbe_output,
     generate_vbe_metadata,
     is_torchdynamo_compiling,
 )
@@ -57,8 +58,8 @@ from fbgemm_gpu.tbe_input_multiplexer import (
     TBEInputMultiplexer,
     TBEInputMultiplexerConfig,
 )
-
 from fbgemm_gpu.utils.loader import load_torch_module, load_torch_module_bc
+from fbgemm_gpu.utils.writeback_util import writeback_gradient
 
 try:
     load_torch_module(
@@ -158,6 +159,7 @@ class UserEnabledConfigDefinition:
     # More details can be found in D64848802.
     use_rowwise_bias_correction: bool = False
     use_writeback_bwd_prehook: bool = False
+    writeback_first_feature_only: bool = False
 
 
 @dataclass(frozen=True)
@@ -190,25 +192,48 @@ class UVMCacheStatsIndex(enum.IntEnum):
 class RESParams:
     res_server_port: int = 0  # the port of the res server
     res_store_shards: int = 1  # the number of shards to store the raw embeddings
-    table_names: List[str] = field(default_factory=list)  # table names the TBE holds
-    table_offsets: List[int] = field(
+    table_names: list[str] = field(default_factory=list)  # table names the TBE holds
+    table_offsets: list[int] = field(
         default_factory=list
     )  # table offsets for the global rows the TBE holds
-    table_sizes: List[int] = field(
+    table_sizes: list[int] = field(
         default_factory=list
     )  # table sizes for the global rows the TBE holds
 
 
+class PrefetchedInfo:
+    """
+    Container for prefetched cache information.
+
+    This class is explicitly defined (not using @dataclass) to be compatible with
+    TorchScript's inspect.getsource() requirements.
+    """
+
+    def __init__(
+        self,
+        linear_unique_indices: torch.Tensor,
+        linear_unique_cache_indices: torch.Tensor,
+        linear_unique_indices_length: torch.Tensor,
+        hash_zch_identities: Optional[torch.Tensor],
+        hash_zch_runtime_meta: Optional[torch.Tensor],
+    ) -> None:
+        self.linear_unique_indices = linear_unique_indices
+        self.linear_unique_cache_indices = linear_unique_cache_indices
+        self.linear_unique_indices_length = linear_unique_indices_length
+        self.hash_zch_identities = hash_zch_identities
+        self.hash_zch_runtime_meta = hash_zch_runtime_meta
+
+
 def construct_split_state(
-    embedding_specs: List[Tuple[int, int, EmbeddingLocation, ComputeDevice]],
+    embedding_specs: list[tuple[int, int, EmbeddingLocation, ComputeDevice]],
     rowwise: bool,
     cacheable: bool,
     precision: SparseType = SparseType.FP32,
     int8_emb_row_dim_offset: int = INT8_EMB_ROW_DIM_OFFSET,
     placement: Optional[EmbeddingLocation] = None,
 ) -> SplitState:
-    placements: List[EmbeddingLocation] = []
-    offsets: List[int] = []
+    placements: list[EmbeddingLocation] = []
+    offsets: list[int] = []
     dev_size: int = 0
     host_size: int = 0
     uvm_size: int = 0
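The PrefetchedInfo entries added above are produced during prefetch and consumed later in this diff by raw_embedding_stream(), which pops them in FIFO order. A minimal illustrative sketch of that flow, with made-up tensors (the real values come from the prefetch path, not from this snippet):

import torch

from fbgemm_gpu.split_table_batched_embeddings_ops_training import PrefetchedInfo  # class added in this release

# Hypothetical stand-ins for the tensors produced during one prefetch call.
info = PrefetchedInfo(
    linear_unique_indices=torch.tensor([3, 7, 11], dtype=torch.int64),
    linear_unique_cache_indices=torch.tensor([3, 7, 11], dtype=torch.int64),
    linear_unique_indices_length=torch.tensor([3], dtype=torch.int64),
    hash_zch_identities=None,
    hash_zch_runtime_meta=None,
)

# The TBE keeps a FIFO of these entries: prefetch appends, streaming pops the oldest.
prefetched_info_list: list[PrefetchedInfo] = []
prefetched_info_list.append(info)
oldest = prefetched_info_list.pop(0)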
@@ -250,18 +275,18 @@ def construct_split_state(
 def apply_split_helper(
     persistent_state_fn: Callable[[str, Tensor], None],
     set_attr_fn: Callable[
-        [str, Union[Tensor, List[int], List[EmbeddingLocation]]], None
+        [str, Union[Tensor, list[int], list[EmbeddingLocation]]], None
     ],
     current_device: torch.device,
     use_cpu: bool,
-    feature_table_map: List[int],
+    feature_table_map: list[int],
     split: SplitState,
     prefix: str,
-    dtype: Type[torch.dtype],
+    dtype: type[torch.dtype],
     enforce_hbm: bool = False,
     make_dev_param: bool = False,
-    dev_reshape: Optional[Tuple[int, ...]] = None,
-    uvm_tensors_log: Optional[List[str]] = None,
+    dev_reshape: Optional[tuple[int, ...]] = None,
+    uvm_tensors_log: Optional[list[str]] = None,
     uvm_host_mapped: bool = False,
 ) -> None:
     set_attr_fn(f"{prefix}_physical_placements", split.placements)
@@ -346,6 +371,7 @@ def apply_split_helper(
             f"{prefix}_uvm",
             torch.zeros(
                 split.uvm_size,
+                device=current_device,
                 out=torch.ops.fbgemm.new_unified_tensor(
                     # pyre-fixme[6]: Expected `Optional[Type[torch._dtype]]`
                     # for 3rd param but got `Type[Type[torch._dtype]]`.
@@ -621,11 +647,12 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
            (preshard_table_height, preshard_table_dim, height_offset, dim_offset)
     """
 
-    embedding_specs: List[Tuple[int, int, EmbeddingLocation, ComputeDevice]]
+    embedding_specs: list[tuple[int, int, EmbeddingLocation, ComputeDevice]]
     optimizer_args: invokers.lookup_args.OptimizerArgs
-    lxu_cache_locations_list: List[Tensor]
+    lxu_cache_locations_list: list[Tensor]
     lxu_cache_locations_empty: Tensor
-    timesteps_prefetched: List[int]
+    timesteps_prefetched: list[int]
+    prefetched_info_list: list[PrefetchedInfo]
     record_cache_metrics: RecordCacheMetrics
     # pyre-fixme[13]: Attribute `uvm_cache_stats` is never initialized.
     uvm_cache_stats: torch.Tensor
@@ -639,10 +666,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
 
     def __init__(  # noqa C901
         self,
-        embedding_specs: List[
-            Tuple[int, int, EmbeddingLocation, ComputeDevice]
+        embedding_specs: list[
+            tuple[int, int, EmbeddingLocation, ComputeDevice]
         ],  # tuple of (rows, dims, placements, compute_devices)
-        feature_table_map: Optional[List[int]] = None,  # [T]
+        feature_table_map: Optional[list[int]] = None,  # [T]
         cache_algorithm: CacheAlgorithm = CacheAlgorithm.LRU,
         cache_load_factor: float = 0.2,
         cache_sets: int = 0,
@@ -680,8 +707,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         use_experimental_tbe: bool = False,
         prefetch_pipeline: bool = False,
         stats_reporter_config: Optional[TBEStatsReporterConfig] = None,
-        table_names: Optional[List[str]] = None,
-        optimizer_state_dtypes: Optional[Dict[str, SparseType]] = None,
+        table_names: Optional[list[str]] = None,
+        optimizer_state_dtypes: Optional[dict[str, SparseType]] = None,
         multipass_prefetch_config: Optional[MultiPassPrefetchConfig] = None,
         global_weight_decay: Optional[GlobalWeightDecayDefinition] = None,
         uvm_host_mapped: bool = False,
@@ -689,7 +716,9 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         tbe_input_multiplexer_config: Optional[TBEInputMultiplexerConfig] = None,
         embedding_table_index_type: torch.dtype = torch.int64,
         embedding_table_offset_type: torch.dtype = torch.int64,
-        embedding_shard_info: Optional[List[Tuple[int, int, int, int]]] = None,
+        embedding_shard_info: Optional[list[tuple[int, int, int, int]]] = None,
+        enable_raw_embedding_streaming: bool = False,
+        res_params: Optional[RESParams] = None,
     ) -> None:
         super(SplitTableBatchedEmbeddingBagsCodegen, self).__init__()
         self.uuid = str(uuid.uuid4())
@@ -699,7 +728,9 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             f"Feature Gates: {[(feature.name, feature.is_enabled()) for feature in FeatureGateName]}"
         )
 
+        self.table_names: Optional[list[str]] = table_names
         self.logging_table_name: str = self.get_table_name_for_logging(table_names)
+        self.enable_raw_embedding_streaming: bool = enable_raw_embedding_streaming
         self.pooling_mode = pooling_mode
         self.is_nobag: bool = self.pooling_mode == PoolingMode.NONE
 
@@ -793,9 +824,9 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         ), "Unique cache miss counters are not accurate in multipass prefetch and therefore not supported"
 
         self.embedding_specs = embedding_specs
-        (rows, dims, locations, compute_devices) = zip(*embedding_specs)
+        rows, dims, locations, compute_devices = zip(*embedding_specs)
         T_ = len(self.embedding_specs)
-        self.dims: List[int] = dims
+        self.dims: list[int] = dims
         assert T_ > 0
         # mixed D is not supported by no bag kernels
         mixed_D = False
@@ -808,7 +839,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             assert (
                 self.pooling_mode != PoolingMode.NONE
             ), "Mixed dimension tables only supported for pooling tables."
-
+        self.mixed_D: bool = mixed_D
         assert all(
             cd == compute_devices[0] for cd in compute_devices
         ), "Heterogenous compute_devices are NOT supported!"
@@ -872,7 +903,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self.stats_reporter: Optional[TBEStatsReporter] = (
             stats_reporter_config.create_reporter() if stats_reporter_config else None
         )
-        self._uvm_tensors_log: List[str] = []
+        self._uvm_tensors_log: list[str] = []
 
         self.bwd_wait_prefetch_timer: Optional[AsyncSeriesTimer] = None
         self.prefetch_duration_timer: Optional[AsyncSeriesTimer] = None
@@ -899,12 +930,12 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
 
         self.int8_emb_row_dim_offset: int = INT8_EMB_ROW_DIM_OFFSET
 
-        self.feature_table_map: List[int] = (
+        self.feature_table_map: list[int] = (
             feature_table_map if feature_table_map is not None else list(range(T_))
         )
 
         if embedding_shard_info:
-            (full_table_heights, full_table_dims, row_offset, col_offset) = zip(
+            full_table_heights, full_table_dims, row_offset, col_offset = zip(
                 *embedding_shard_info
             )
         else:
@@ -939,7 +970,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         table_has_feature = [False] * T_
         for t in self.feature_table_map:
             table_has_feature[t] = True
-        assert all(table_has_feature), "Each table must have at least one feature!"
+        assert all(table_has_feature), (
+            "Each table must have at least one feature!"
+            + f"{[(i, x) for i, x in enumerate(table_has_feature)]}"
+        )
 
         feature_dims = [dims[t] for t in self.feature_table_map]
         D_offsets = [0] + list(accumulate(feature_dims))
@@ -991,7 +1025,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             "feature_dims",
             torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
         )
-        (_info_B_num_bits, _info_B_mask) = torch.ops.fbgemm.get_infos_metadata(
+        _info_B_num_bits, _info_B_mask = torch.ops.fbgemm.get_infos_metadata(
             self.D_offsets,  # unused tensor
             1,  # max_B
             T,  # T
@@ -1105,13 +1139,13 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
 
         if ensemble_mode is None:
             ensemble_mode = EnsembleModeDefinition()
-        self._ensemble_mode: Dict[str, float] = {
+        self._ensemble_mode: dict[str, float] = {
             key: float(fval) for key, fval in ensemble_mode.__dict__.items()
         }
 
         if emainplace_mode is None:
             emainplace_mode = EmainplaceModeDefinition()
-        self._emainplace_mode: Dict[str, float] = {
+        self._emainplace_mode: dict[str, float] = {
             key: float(fval) for key, fval in emainplace_mode.__dict__.items()
         }
 
@@ -1151,6 +1185,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self.use_writeback_bwd_prehook: bool = (
             extra_optimizer_config.use_writeback_bwd_prehook
         )
+
+        writeback_first_feature_only: bool = (
+            extra_optimizer_config.writeback_first_feature_only
+        )
         self.log(f"self.extra_optimizer_config is {extra_optimizer_config}")
         if self.use_rowwise_bias_correction and not self.optimizer == OptimType.ADAM:
             raise AssertionError(
@@ -1416,7 +1454,11 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
 
         self.step = 0
         self.last_reported_step = 0
-        self.last_reported_uvm_stats: List[float] = []
+        self.last_reported_uvm_stats: list[float] = []
+        # Track number of times detailed memory breakdown has been reported
+        self.detailed_mem_breakdown_report_count = 0
+        # Set max number of reports for detailed memory breakdown
+        self.max_detailed_mem_breakdown_reports = 10
 
         # Check whether to use TBE v2
         is_experimental = False
@@ -1435,16 +1477,22 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             # self.log("TBE_V2 Knob is set to True; Using experimental TBE")
 
         self.is_experimental: bool = is_experimental
+        self._writeback_first_feature_only: bool = writeback_first_feature_only
 
         # Get a debug function pointer
         self._debug_print_input_stats: Callable[..., None] = (
             self._debug_print_input_stats_factory()
         )
 
+        # Get a reporter function pointer
+        self._report_input_params: Callable[..., None] = (
+            self.__report_input_params_factory()
+        )
+
         if optimizer == OptimType.EXACT_SGD and self.use_writeback_bwd_prehook:
             # Register writeback hook for Exact_SGD optimizer
             self.log(
-                "SplitTableBatchedEmbeddingBagsCodegen: use_writeback_bwd_prehook is enabled."
+                f"SplitTableBatchedEmbeddingBagsCodegen: use_writeback_bwd_prehook is enabled with first feature only={self._writeback_first_feature_only}"
             )
             # pyre-fixme[6]: Expected `typing.Callable[[Module, Union[Tensor, typing.Tuple[Tensor, ...]]], Union[None, Tensor, typing.Tuple[Tensor, ...]]]`
             self.register_full_backward_pre_hook(self.writeback_hook)
@@ -1460,6 +1508,30 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         )
         self.embedding_table_offset_type: torch.dtype = embedding_table_offset_type
 
+        self.prefetched_info_list: list[PrefetchedInfo] = torch.jit.annotate(
+            list[PrefetchedInfo], []
+        )
+        if self.enable_raw_embedding_streaming:
+            self.res_params: RESParams = res_params or RESParams()
+            self.res_params.table_sizes = [0] + list(accumulate(rows))
+            res_port_from_env = os.getenv("LOCAL_RES_PORT")
+            self.res_params.res_server_port = (
+                int(res_port_from_env) if res_port_from_env else 0
+            )
+            # pyre-fixme[4]: Attribute must be annotated.
+            self._raw_embedding_streamer = torch.classes.fbgemm.RawEmbeddingStreamer(
+                self.uuid,
+                self.enable_raw_embedding_streaming,
+                self.res_params.res_store_shards,
+                self.res_params.res_server_port,
+                self.res_params.table_names,
+                self.res_params.table_offsets,
+                self.res_params.table_sizes,
+            )
+            logging.info(
+                f"{self.uuid} raw embedding streaming enabled with {self.res_params=}"
+            )
+
     @torch.jit.ignore
     def log(self, msg: str) -> None:
         """
@@ -1503,7 +1575,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         )
 
     @staticmethod
-    def get_table_name_for_logging(table_names: Optional[List[str]]) -> str:
+    def get_table_name_for_logging(table_names: Optional[list[str]]) -> str:
         """
         Given a list of all table names in the TBE, generate a string to
         represent them in logging. If there is more than one table, this method
@@ -1519,17 +1591,17 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             return "<Unknown>"
         # Do this because sometimes multiple shards of the same table could appear
         # in one TBE.
-        table_name_set = set(table_names)
+        table_name_set = sorted(set(table_names))
         if len(table_name_set) == 1:
             return next(iter(table_name_set))
-        return f"<{len(table_name_set)} tables>"
+        return f"<{len(table_name_set)} tables>: {table_name_set}"
 
     @staticmethod
     def get_prefetch_passes(
         multipass_prefetch_config: Optional[MultiPassPrefetchConfig],
         input_tensor: Tensor,
         output_tensor: Tensor,
-    ) -> List[Tuple[Tensor, Tensor, int]]:
+    ) -> list[tuple[Tensor, Tensor, int]]:
         """
         Given inputs (the indices to forward), partition the input and output
         into smaller chunks and return them as a list of tuples
@@ -1577,7 +1649,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 )
             )
 
-    def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
+    def get_states(self, prefix: str) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
         """
         Get a state of a given tensor (`prefix`)
 
@@ -1616,7 +1688,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             torch.tensor(offsets, dtype=torch.int64),
         )
 
-    def get_all_states(self) -> List[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]:
+    def get_all_states(self) -> list[tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]:
         """
         Get all states in the TBE (`weights`, `momentum1`, `momentum2`,
         `prev_iter`, and `row_counter`)
@@ -1680,10 +1752,161 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )
 
-    @torch.jit.ignore
-    def _report_tbe_mem_usage(
+    def _get_tensor_memory(self, tensor_name: str) -> int:
+        """Get memory usage of a tensor in bytes."""
+        if not hasattr(self, tensor_name):
+            self.log(f"Tensor '{tensor_name}' not found, using 0 bytes")
+            return 0
+        tensor = getattr(self, tensor_name)
+        return tensor.numel() * tensor.element_size()
+
+    def _categorize_memory_by_location(
+        self, tensor_names: list[str]
+    ) -> tuple[int, int]:
+        """Categorize memory into HBM and UVM for given tensors.
+
+        Returns:
+            (hbm_bytes, uvm_bytes)
+        """
+        uvm_set = set(self._uvm_tensors_log)
+        hbm_bytes = 0
+        uvm_bytes = 0
+
+        for name in tensor_names:
+            size = self._get_tensor_memory(name)
+            if name in uvm_set:
+                uvm_bytes += size
+            else:
+                hbm_bytes += size
+
+        return hbm_bytes, uvm_bytes
+
+    def _report_hbm_breakdown(
+        self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
+        cache_weights: int = 0,
+        cache_aux: int = 0,
+    ) -> None:
+        """Report HBM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache_weights",
+            data_bytes=cache_weights,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.cache_aux",
+            data_bytes=cache_aux,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.hbm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    def _report_uvm_breakdown(
         self,
+        stats_reporter: TBEStatsReporter,
+        embeddings: int,
+        optimizer_states: int,
+        cache: int,
+        total_static_sparse: int,
+        ephemeral: int,
+        cache_weights: int = 0,
+        cache_aux: int = 0,
     ) -> None:
+        """Report UVM memory breakdown to stats reporter."""
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.embeddings",
+            data_bytes=embeddings,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.optimizer_states",
+            data_bytes=optimizer_states,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache",
+            data_bytes=cache,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache_weights",
+            data_bytes=cache_weights,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.cache_aux",
+            data_bytes=cache_aux,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.total_static_sparse",
+            data_bytes=total_static_sparse,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+        stats_reporter.report_data_amount(
+            iteration_step=self.step,
+            event_name="tbe.uvm.ephemeral",
+            data_bytes=ephemeral,
+            embedding_id=self.logging_table_name,
+            tbe_id=self.uuid,
+        )
+
+    @torch.jit.ignore
+    def _report_tbe_mem_usage(self) -> None:
         if self.stats_reporter is None:
             return
 
@@ -1691,22 +1914,24 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if not stats_reporter.should_report(self.step):
             return
 
+        # Calculate total memory from all parameters and buffers (always needed)
         total_mem_usage = sum(
-            param.numel() * param.element_size() for param in self.parameters()
-        ) + sum(buffer.numel() * buffer.element_size() for buffer in self.buffers())
+            p.numel() * p.element_size() for p in self.parameters()
+        ) + sum(b.numel() * b.element_size() for b in self.buffers())
+
+        # Calculate total HBM and UVM usage (always needed)
         if self.use_cpu:
             total_hbm_usage = 0
             total_uvm_usage = total_mem_usage
         else:
-            # hbm usage is total usage minus uvm usage
             total_uvm_usage = sum(
-                getattr(self, tensor_name).numel()
-                * getattr(self, tensor_name).element_size()
-                for tensor_name in self._uvm_tensors_log
-                if hasattr(self, tensor_name)
+                self._get_tensor_memory(name)
+                for name in self._uvm_tensors_log
+                if hasattr(self, name)
            )
            total_hbm_usage = total_mem_usage - total_uvm_usage
 
+        # Report total memory usage metrics (always reported for backward compatibility)
         stats_reporter.report_data_amount(
             iteration_step=self.step,
             event_name="tbe.total_hbm_usage",
@@ -1722,6 +1947,96 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             tbe_id=self.uuid,
         )
 
+        # Only report detailed breakdown for the first max_detailed_mem_breakdown_reports reportable
+        # steps since static sparse memory (weights, optimizer states, cache) is constant
+        if (
+            self.detailed_mem_breakdown_report_count
+            >= self.max_detailed_mem_breakdown_reports
+        ):
+            return
+        self.detailed_mem_breakdown_report_count += 1
+
+        # Tensor groups for sparse memory categorization
+        weight_tensors = ["weights_dev", "weights_host", "weights_uvm"]
+        optimizer_tensors = [
+            "momentum1_dev",
+            "momentum1_host",
+            "momentum1_uvm",
+            "momentum2_dev",
+            "momentum2_host",
+            "momentum2_uvm",
+        ]
+        # Cache weights tensor (the actual cached embeddings in HBM)
+        cache_weight_tensors = [
+            "lxu_cache_weights",
+        ]
+        # Cache auxiliary state tensors (metadata for cache management, excluding weights)
+        # Sizes scale with hash_size or cache_slots (hash_size × clf)
+        # Excludes constant-size tensors: cache_hash_size_cumsum, cache_miss_counter, etc.
+        cache_aux_tensors = [
+            "cache_index_table_map",  # int32, 4B × hash_size
+            "lxu_cache_state",  # int64, 8B × cache_slots
+            "lxu_state",  # int64, 8B × cache_slots (LRU) or hash_size (LFU)
+            "lxu_cache_locking_counter",  # int32, 4B × cache_slots (only if prefetch_pipeline)
+        ]
+
+        # Calculate total memory for each component
+        weights_total = sum(self._get_tensor_memory(t) for t in weight_tensors)
+        optimizer_total = sum(self._get_tensor_memory(t) for t in optimizer_tensors)
+        cache_weights_total = sum(
+            self._get_tensor_memory(t) for t in cache_weight_tensors
+        )
+        cache_aux_total = sum(self._get_tensor_memory(t) for t in cache_aux_tensors)
+
+        # Categorize memory by location (HBM vs UVM)
+        if self.use_cpu:
+            weights_hbm, weights_uvm = 0, weights_total
+            opt_hbm, opt_uvm = 0, optimizer_total
+            cache_weights_hbm, cache_weights_uvm = 0, cache_weights_total
+            cache_aux_hbm, cache_aux_uvm = 0, cache_aux_total
+        else:
+            weights_hbm, weights_uvm = self._categorize_memory_by_location(
+                weight_tensors
+            )
+            opt_hbm, opt_uvm = self._categorize_memory_by_location(optimizer_tensors)
+            cache_weights_hbm, cache_weights_uvm = self._categorize_memory_by_location(
+                cache_weight_tensors
+            )
+            cache_aux_hbm, cache_aux_uvm = self._categorize_memory_by_location(
+                cache_aux_tensors
+            )
+
+        # Calculate ephemeral memory split between HBM and UVM
+        # Total cache = cache weights + cache auxiliary state
+        cache_hbm = cache_weights_hbm + cache_aux_hbm
+        cache_uvm = cache_weights_uvm + cache_aux_uvm
+        static_sparse_hbm = weights_hbm + opt_hbm + cache_hbm
+        static_sparse_uvm = weights_uvm + opt_uvm + cache_uvm
+        ephemeral_hbm = total_hbm_usage - static_sparse_hbm
+        ephemeral_uvm = total_uvm_usage - static_sparse_uvm
+
+        # Report granular memory breakdowns
+        self._report_hbm_breakdown(
+            stats_reporter,
+            weights_hbm,
+            opt_hbm,
+            cache_hbm,
+            static_sparse_hbm,
+            ephemeral_hbm,
+            cache_weights_hbm,
+            cache_aux_hbm,
+        )
+        self._report_uvm_breakdown(
+            stats_reporter,
+            weights_uvm,
+            opt_uvm,
+            cache_uvm,
+            static_sparse_uvm,
+            ephemeral_uvm,
+            cache_weights_uvm,
+            cache_aux_uvm,
+        )
+
     @torch.jit.ignore
     def _report_io_size_count(self, event: str, data: Tensor) -> Tensor:
         if self.stats_reporter is None:
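To make the new breakdown concrete, here is a small self-contained sketch of the same accounting (illustrative byte counts only; tensor names follow the groups above). Membership in _uvm_tensors_log decides UVM vs. HBM, and ephemeral memory is whatever remains after subtracting the static sparse state from the totals:

# Illustrative numbers only (bytes); in the TBE these come from tensor.numel() * element_size().
tensor_bytes = {
    "weights_dev": 4_000_000,
    "weights_uvm": 16_000_000,
    "momentum1_dev": 1_000_000,
    "lxu_cache_weights": 2_000_000,
}
uvm_tensors_log = {"weights_uvm"}

def categorize(names):
    # Tensors logged as UVM count toward UVM; everything else counts toward HBM.
    hbm = sum(tensor_bytes.get(n, 0) for n in names if n not in uvm_tensors_log)
    uvm = sum(tensor_bytes.get(n, 0) for n in names if n in uvm_tensors_log)
    return hbm, uvm

weights_hbm, weights_uvm = categorize(["weights_dev", "weights_host", "weights_uvm"])
opt_hbm, opt_uvm = categorize(["momentum1_dev", "momentum1_host", "momentum1_uvm"])
cache_hbm, cache_uvm = categorize(["lxu_cache_weights", "lxu_cache_state"])

static_hbm = weights_hbm + opt_hbm + cache_hbm
static_uvm = weights_uvm + opt_uvm + cache_uvm

# Totals would come from summing parameters() and buffers(); assumed here.
total_hbm, total_uvm = 8_000_000, 16_000_000
ephemeral_hbm = total_hbm - static_hbm
ephemeral_uvm = total_uvm - static_uvm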
@@ -1748,7 +2063,9 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
     def _generate_vbe_metadata(
         self,
         offsets: Tensor,
-        batch_size_per_feature_per_rank: Optional[List[List[int]]],
+        batch_size_per_feature_per_rank: Optional[list[list[int]]],
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> invokers.lookup_args.VBEMetadata:
         # Blocking D2H copy, but only runs at first call
         self.feature_dims = self.feature_dims.cpu()
@@ -1771,6 +2088,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self.pooling_mode,
             self.feature_dims,
             self.current_device,
+            vbe_output,
+            vbe_output_offsets,
         )
 
     @torch.jit.ignore
@@ -1779,40 +2098,17 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         # This allows models using this class to compile correctly
         return FeatureGate.is_enabled(feature)
 
-    def writeback_update_gradient(
-        self, indices: torch.Tensor, offsets: torch.Tensor, grad: Tensor
-    ) -> Tensor:
-        if indices.numel() == 0:
-            return grad[0]
-        num_of_tables = len(set(self.feature_table_map))
-        assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
-        batch_size = offsets.shape[0] // num_of_tables
-        max_indices = indices.max()
-        non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
-        # disable dedup across different table
-        indices = ((offsets[non_empty_index]) // batch_size) * (
-            1 + max_indices
-        ) + indices
-        grad = grad[0]
-        _, idx, counts = torch.unique(
-            indices, dim=0, sorted=True, return_inverse=True, return_counts=True
-        )
-        _, ind_sorted = torch.sort(idx, stable=True)
-        cum_sum = counts.cumsum(0)
-        cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
-        first_indicies = ind_sorted[cum_sum]
-        mask = torch.zeros_like(grad, device=grad.device)
-        original_index = non_empty_index[first_indicies]
-
-        mask[original_index] = grad[original_index]
-        return mask
-
     # pyre-fixme[2]: For 1st argument expected not ANY
-    def writeback_hook(self, module: Any, grad: Tensor) -> Tuple[Tensor]:
+    def writeback_hook(self, module: Any, grad: Tensor) -> tuple[Tensor]:
         indices = self._indices
         offsets = self._offsets
-
-        return (self.writeback_update_gradient(indices, offsets, grad),)
+        return writeback_gradient(
+            grad,
+            indices,
+            offsets,
+            self.feature_table_map,
+            self._writeback_first_feature_only,
+        )
 
     def forward(  # noqa: C901
         self,
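The deleted writeback_update_gradient body above captures the logic that now lives in fbgemm_gpu.utils.writeback_util.writeback_gradient: when an index appears more than once in the batch, only its first occurrence keeps a gradient row and the duplicates are zeroed. A standalone sketch of that dedup step (illustrative only, not the helper's actual implementation):

import torch

def keep_first_occurrence_grad(indices: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
    # grad has one row per lookup; zero out rows whose index already appeared earlier.
    _, inverse, counts = torch.unique(
        indices, sorted=True, return_inverse=True, return_counts=True
    )
    _, order = torch.sort(inverse, stable=True)
    # Position of the first occurrence of each unique index in the original order.
    first_pos = order[torch.cat((torch.zeros(1, dtype=torch.long), counts.cumsum(0)[:-1]))]
    mask = torch.zeros_like(grad)
    mask[first_pos] = grad[first_pos]
    return mask

grads = torch.ones(4, 2)
print(keep_first_occurrence_grad(torch.tensor([5, 5, 9, 5]), grads))
# Rows 0 (first 5) and 2 (first 9) keep their gradients; rows 1 and 3 are zeroed.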
@@ -1820,8 +2116,12 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         offsets: Tensor,
         per_sample_weights: Optional[Tensor] = None,
         feature_requires_grad: Optional[Tensor] = None,
-        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
         total_unique_indices: Optional[int] = None,
+        hash_zch_identities: Optional[Tensor] = None,
+        hash_zch_runtime_meta: Optional[Tensor] = None,
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
     ) -> Tensor:
         """
         The forward pass function that
@@ -1874,7 +2174,22 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 be set when using `OptimType.NONE`. This is because TBE
                 requires this information for allocating the weight gradient
                 tensor in the backward pass.
-
+            hash_zch_identities (Optional[Tensor]): The original raw IDs before
+                remapping to ZCH (Zero-Collision Hash) table slots. This tensor is
+                populated when using Multi-Probe Zero Collision Hash (MPZCH) modules
+                and is required for Raw Embedding Streaming (RES) to maintain
+                consistency between training and inference.
+            vbe_output (Optional[Tensor]): An optional 2-D tensor of size that
+                contains output for TBE VBE. The shape of the tensor is
+                [1, total_vbe_output_size] where total_vbe_output_size is the
+                output size across all ranks and all embedding tables.
+                If this tensor is not None, the TBE VBE forward output is written
+                to this tensor at the locations specified by `vbe_output_offsets`.
+            vbe_output_offsets (Optional[Tensor]): An optional 2-D tensor that
+                contains VBE output offsets to `vbe_output`. The shape of the
+                tensor is [num_ranks, num_features].
+                vbe_output_offsets[r][f] represents the starting offset for rank `r`
+                and feature `f`.
         Returns:
             A 2D-tensor containing looked up data. Shape `(B, total_D)` where `B` =
             batch size and `total_D` = the sum of all embedding dimensions in the
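A minimal sketch of how a caller might use the pre-allocated VBE output path documented above. The shapes follow the docstring; `tbe` is assumed to be an already-constructed SplitTableBatchedEmbeddingBagsCodegen configured for VBE on GPU, the indices/offsets are toy values, and real offsets would come from the caller's sharding logic:

import torch

# One rank, two features, FP32 output; per-feature batch sizes 3 and 2, both dims 8.
batch_size_per_feature_per_rank = [[3], [2]]
total_vbe_output_size = 3 * 8 + 2 * 8
vbe_output = torch.empty(1, total_vbe_output_size, dtype=torch.float32, device="cuda")
# vbe_output_offsets[r][f]: where rank r / feature f starts inside vbe_output.
vbe_output_offsets = torch.tensor([[0, 3 * 8]], dtype=torch.int64, device="cuda")

indices = torch.tensor([1, 4, 2, 7, 9], dtype=torch.int64, device="cuda")
offsets = torch.tensor([0, 2, 3, 3, 4, 5], dtype=torch.int64, device="cuda")

out = tbe(
    indices,
    offsets,
    batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
    vbe_output=vbe_output,
    vbe_output_offsets=vbe_output_offsets,
)
# With a pre-allocated buffer, the forward result is written into vbe_output.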
@@ -1948,11 +2263,34 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             batch_size_per_feature_per_rank,
             force_cast_input_types=True,
             prefetch_pipeline=False,
+            vbe_output=vbe_output,
+            vbe_output_offsets=vbe_output_offsets,
         )
 
+        # Only enable VBE if batch_size_per_feature_per_rank is not None
+        assert not (
+            batch_size_per_feature_per_rank is not None
+            and self.use_writeback_bwd_prehook
+        ), "VBE is not supported with writeback_bwd_prehook"
+
         # Print input stats if enable (for debugging purpose only)
         self._debug_print_input_stats(indices, offsets, per_sample_weights)
 
+        # Extract and Write input stats if enable
+        if self._report_input_params is not None:
+            self._report_input_params(
+                feature_rows=self.rows_per_table,
+                feature_dims=self.feature_dims,
+                iteration=self.iter_cpu.item() if hasattr(self, "iter_cpu") else 0,
+                indices=indices,
+                offsets=offsets,
+                op_id=self.uuid,
+                per_sample_weights=per_sample_weights,
+                batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+                embedding_specs=[(s[0], s[1]) for s in self.embedding_specs],
+                feature_table_map=self.feature_table_map,
+            )
+
         if not is_torchdynamo_compiling():
             # Mutations of nn.Module attr forces dynamo restart of Analysis which increases compilation time
 
@@ -1980,7 +2318,12 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             # to be as fast as possible and memory usage doesn't matter (will be recycled
             # by dense fwd/bwd)
             self._prefetch(
-                indices, offsets, vbe_metadata, multipass_prefetch_config=None
+                indices,
+                offsets,
+                vbe_metadata,
+                multipass_prefetch_config=None,
+                hash_zch_identities=hash_zch_identities,
+                hash_zch_runtime_meta=hash_zch_runtime_meta,
             )
 
         if len(self.timesteps_prefetched) > 0:
@@ -2262,6 +2605,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                     row_counter,
                     iter_int,
                     self.max_counter.item(),
+                    mixed_D=self.mixed_D,
                 ),
             )
         elif self._used_rowwise_adagrad_with_global_weight_decay:
@@ -2280,6 +2624,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                     # `Optional[Tensor]` but got `Union[Module, Tensor]`.
                     prev_iter_dev=self.prev_iter_dev,
                     gwd_lower_bound=self.gwd_lower_bound,
+                    mixed_D=self.mixed_D,
                 ),
             )
         else:
@@ -2289,12 +2634,13 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                     common_args,
                     self.optimizer_args,
                     momentum1,
+                    mixed_D=self.mixed_D,
                 ),
             )
 
         raise ValueError(f"Invalid OptimType: {self.optimizer}")
 
-    def ema_inplace(self, emainplace_mode: Dict[str, float]) -> None:
+    def ema_inplace(self, emainplace_mode: dict[str, float]) -> None:
         """
         Perform ema operations on the full sparse embedding tables.
         We organize the sparse table, in the following way.
@@ -2324,7 +2670,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 emainplace_mode["step_ema_coef"],
             )
 
-    def ensemble_and_swap(self, ensemble_mode: Dict[str, float]) -> None:
+    def ensemble_and_swap(self, ensemble_mode: dict[str, float]) -> None:
         """
         Perform ensemble and swap operations on the full sparse embedding tables.
 
@@ -2372,7 +2718,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         ), "gather_uvm_cache_stats should be set to true to access uvm cache stats."
         return self.local_uvm_cache_stats if use_local_cache else self.uvm_cache_stats
 
-    def _get_uvm_cache_print_state(self, use_local_cache: bool = False) -> List[float]:
+    def _get_uvm_cache_print_state(self, use_local_cache: bool = False) -> list[float]:
         snapshot = self.get_uvm_cache_stats(use_local_cache)
         if use_local_cache:
             return snapshot.tolist()
@@ -2385,7 +2731,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
     @torch.jit.ignore
     def print_uvm_cache_stats(self, use_local_cache: bool = False) -> None:
         # TODO: Create a separate reporter class to unify the stdlog reporting
-        uvm_cache_stats: List[float] = self._get_uvm_cache_print_state(use_local_cache)
+        uvm_cache_stats: list[float] = self._get_uvm_cache_print_state(use_local_cache)
         N = max(1, uvm_cache_stats[0])
         m = {
             "N_called": uvm_cache_stats[UVMCacheStatsIndex.num_calls],
@@ -2429,14 +2775,14 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if not stats_reporter.should_report(self.step):
             return
 
-        uvm_cache_stats: List[float] = self.get_uvm_cache_stats(
+        uvm_cache_stats: list[float] = self.get_uvm_cache_stats(
             use_local_cache=False
         ).tolist()
         self.last_reported_step = self.step
 
         if len(self.last_reported_uvm_stats) == 0:
             self.last_reported_uvm_stats = [0.0] * len(uvm_cache_stats)
-        uvm_cache_stats_delta: List[float] = [0.0] * len(uvm_cache_stats)
+        uvm_cache_stats_delta: list[float] = [0.0] * len(uvm_cache_stats)
         for i in range(len(uvm_cache_stats)):
             uvm_cache_stats_delta[i] = (
                 uvm_cache_stats[i] - self.last_reported_uvm_stats[i]
@@ -2465,7 +2811,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         indices: Tensor,
         offsets: Tensor,
         forward_stream: Optional[torch.cuda.Stream] = None,
-        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
     ) -> None:
         if self.prefetch_stream is None and forward_stream is not None:
             self.prefetch_stream = torch.cuda.current_stream()
@@ -2473,20 +2819,21 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self.prefetch_stream != forward_stream
         ), "prefetch_stream and forward_stream should not be the same stream"
 
-        indices, offsets, _, vbe_metadata = self.prepare_inputs(
-            indices,
-            offsets,
-            per_sample_weights=None,
-            batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
-            force_cast_input_types=False,
-            prefetch_pipeline=self.prefetch_pipeline,
-        )
-
         with self._recording_to_timer(
             self.prefetch_duration_timer,
             context=self.step,
             stream=torch.cuda.current_stream(),
         ):
+
+            indices, offsets, _, vbe_metadata = self.prepare_inputs(
+                indices,
+                offsets,
+                per_sample_weights=None,
+                batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+                force_cast_input_types=False,
+                prefetch_pipeline=self.prefetch_pipeline,
+            )
+
             self._prefetch(
                 indices,
                 offsets,
@@ -2503,6 +2850,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         offsets: Tensor,
         vbe_metadata: Optional[invokers.lookup_args.VBEMetadata] = None,
         multipass_prefetch_config: Optional[MultiPassPrefetchConfig] = None,
+        hash_zch_identities: Optional[Tensor] = None,
+        hash_zch_runtime_meta: Optional[Tensor] = None,
     ) -> None:
         if not is_torchdynamo_compiling():
             # Mutations of nn.Module attr forces dynamo restart of Analysis which increases compilation time
@@ -2521,7 +2870,13 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self.local_uvm_cache_stats.zero_()
         self._report_io_size_count("prefetch_input", indices)
 
+        # streaming before updating the cache
+        self.raw_embedding_stream()
+
         final_lxu_cache_locations = torch.empty_like(indices, dtype=torch.int32)
+        linear_cache_indices_merged = torch.zeros(
+            0, dtype=indices.dtype, device=indices.device
+        )
         for (
             partial_indices,
             partial_lxu_cache_locations,
@@ -2537,6 +2892,9 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 vbe_metadata.max_B if vbe_metadata is not None else -1,
                 base_offset,
             )
+            linear_cache_indices_merged = torch.cat(
+                [linear_cache_indices_merged, linear_cache_indices]
+            )
 
             if (
                 self.record_cache_metrics.record_cache_miss_counter
@@ -2617,6 +2975,16 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         if self.should_log():
             self.print_uvm_cache_stats(use_local_cache=False)
 
+        self._store_prefetched_tensors(
+            indices,
+            offsets,
+            vbe_metadata,
+            linear_cache_indices_merged,
+            final_lxu_cache_locations,
+            hash_zch_identities,
+            hash_zch_runtime_meta,
+        )
+
     def should_log(self) -> bool:
         """Determines if we should log for this step, using exponentially decreasing frequency.
 
@@ -2701,12 +3069,34 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 tmp_emb.uniform_(min_val, max_val)
                 tmp_emb_i8 = torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized(tmp_emb)
                 emb.data.copy_(tmp_emb_i8)
+        # Torch doesnt implement direct fp8 distribution functions, so we need to start in higher precision.
+        elif self.weights_precision == SparseType.NFP8:
+            assert (
+                self.current_device.type == "cuda"
+            ), "NFP8 is currently only supportd on GPU."
+            assert self.optimizer in [
+                OptimType.EXACT_ADAGRAD,
+                OptimType.ROWWISE_ADAGRAD,
+                OptimType.EXACT_ROWWISE_ADAGRAD,
+                OptimType.ENSEMBLE_ROWWISE_ADAGRAD,
+                OptimType.EMAINPLACE_ROWWISE_ADAGRAD,
+            ], "NFP8 is currently only supportd with adagrad optimizers."
+            for param in splits:
+                tmp_param = torch.zeros(param.shape, device=self.current_device)
+                # Create initialized weights and cast to fp8.
+                fp8_dtype = (
+                    torch.float8_e4m3fnuz
+                    if torch.version.hip is not None
+                    else torch.float8_e4m3fn
+                )
+                tmp_param.uniform_(min_val, max_val).to(fp8_dtype)
+                param.data.copy_(tmp_param)
         else:
             for param in splits:
                 param.uniform_(min_val, max_val)
 
     @torch.jit.ignore
-    def split_embedding_weights(self) -> List[Tensor]:
+    def split_embedding_weights(self) -> list[Tensor]:
         """
         Returns a list of embedding weights (view), split by table
 
@@ -2748,7 +3138,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             raise ValueError(f"Optimizer buffer {state} not found")
 
     @torch.jit.export
-    def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]:
+    def get_optimizer_state(self) -> list[dict[str, torch.Tensor]]:
         r"""
         Get the optimizer state dict that matches the OSS Pytorch optims
         TODO: populate the supported list of optimizers
@@ -2832,7 +3222,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
     @torch.jit.ignore
     def split_optimizer_states(
         self,
-    ) -> List[List[torch.Tensor]]:
+    ) -> list[list[torch.Tensor]]:
         """
         Returns a list of optimizer states (view), split by table
 
@@ -2880,7 +3270,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             state_offsets: Tensor,
             state_placements: Tensor,
             rowwise: bool,
-        ) -> List[torch.Tensor]:
+        ) -> list[torch.Tensor]:
             splits = []
             for t, (rows, dim, _, _) in enumerate(self.embedding_specs):
                 offset = state_offsets[t]
@@ -2899,7 +3289,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 splits.append(state.detach()[offset : offset + rows].view(rows))
             return splits
 
-        states: List[List[torch.Tensor]] = []
+        states: list[list[torch.Tensor]] = []
         if self.optimizer not in (OptimType.EXACT_SGD,):
             states.append(
                 get_optimizer_states(
@@ -3025,7 +3415,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         return self.learning_rate_tensor.item()
 
     @torch.jit.ignore
-    def update_hyper_parameters(self, params_dict: Dict[str, float]) -> None:
+    def update_hyper_parameters(self, params_dict: dict[str, float]) -> None:
         """
         Sets hyper-parameters from external control flow.
 
@@ -3101,10 +3491,10 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self,
         split: SplitState,
         prefix: str,
-        dtype: Type[torch.dtype],
+        dtype: type[torch.dtype],
         enforce_hbm: bool = False,
         make_dev_param: bool = False,
-        dev_reshape: Optional[Tuple[int, ...]] = None,
+        dev_reshape: Optional[tuple[int, ...]] = None,
         uvm_host_mapped: bool = False,
     ) -> None:
         apply_split_helper(
@@ -3154,6 +3544,9 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             dtype = torch.float32
         elif cache_precision == SparseType.FP16:
             dtype = torch.float16
+        elif cache_precision == SparseType.NFP8:
+            # NFP8 weights use floating point cache.
+            dtype = torch.float16
         else:
             dtype = torch.float32  # not relevant, but setting it to keep linter happy
         if not self.use_cpu > 0:
@@ -3347,7 +3740,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
     def _update_cache_counter_and_locations(
         self,
         module: nn.Module,
-        grad_input: Union[Tuple[Tensor, ...], Tensor],
+        grad_input: Union[tuple[Tensor, ...], Tensor],
     ) -> None:
         """
         Backward prehook function when prefetch_pipeline is enabled.
@@ -3543,10 +3936,12 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         indices: Tensor,
         offsets: Tensor,
         per_sample_weights: Optional[Tensor] = None,
-        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
         force_cast_input_types: bool = True,
         prefetch_pipeline: bool = False,
-    ) -> Tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
+        vbe_output: Optional[Tensor] = None,
+        vbe_output_offsets: Optional[Tensor] = None,
+    ) -> tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
         """
         Prepare TBE inputs as follows:
 
@@ -3572,9 +3967,20 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             metadata
         """
 
+        if vbe_output is not None or vbe_output_offsets is not None:
+            assert (
+                not self.use_cpu
+            ), "[TBE API v2] Using pre-allocated vbe_output is not supported on CPU"
+            check_allocated_vbe_output(
+                self.output_dtype,
+                batch_size_per_feature_per_rank,
+                vbe_output,
+                vbe_output_offsets,
+            )
+
         # Generate VBE metadata
         vbe_metadata = self._generate_vbe_metadata(
-            offsets, batch_size_per_feature_per_rank
+            offsets, batch_size_per_feature_per_rank, vbe_output, vbe_output_offsets
         )
 
         vbe = vbe_metadata.B_offsets is not None
@@ -3647,7 +4053,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 self.is_nobag,
                 vbe_metadata.max_B_feature_rank,
                 self.info_B_num_bits,
-                offsets.numel() - 1,  # total_B
+                offsets.numel() - 1,  # total_B,
+                vbe_output_offsets,
             )
         else:
             b_t_map = None
@@ -3736,7 +4143,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             # Counts of indices that segment lengths > 1024
             counts_cta_per_row_mth = counts_cta_per_row[counts_cta_per_row > 1024]
 
-            def compute_numel_and_avg(counts: Tensor) -> Tuple[int, float]:
+            def compute_numel_and_avg(counts: Tensor) -> tuple[int, float]:
                 numel = counts.numel()
                 avg = (counts.sum().item() / numel) if numel != 0 else -1.0
                 return numel, avg
@@ -3804,6 +4211,240 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
  return _debug_print_input_stats_factory_impl
  return _debug_print_input_stats_factory_null

+ @torch.jit.ignore
+ def raw_embedding_stream(self) -> None:
+ if not self.enable_raw_embedding_streaming:
+ return None
+ # When pipelining is enabled, the prefetch in iter i happens before the
+ # sparse backward of iter i - 1, so embeddings for iter i - 1's changed
+ # ids are not yet updated and we can only fetch the indices from iter i - 2.
+ # When pipelining is disabled, the prefetch in iter i happens before the
+ # forward of iter i, so iter i - 1's changed ids can be fetched safely.
+ target_prev_iter = 1
+ if self.prefetch_pipeline:
+ target_prev_iter = 2
+ if not len(self.prefetched_info_list) > (target_prev_iter - 1):
+ return None
+ with record_function(
+ "## uvm_lookup_prefetched_rows {} {} ##".format(self.timestep, self.uuid)
+ ):
+ prefetched_info = self.prefetched_info_list.pop(0)
+ updated_locations = torch.ops.fbgemm.lxu_cache_lookup(
+ prefetched_info.linear_unique_cache_indices,
+ self.lxu_cache_state,
+ self.total_cache_hash_size,
+ gather_cache_stats=False, # not collecting cache stats
+ num_uniq_cache_indices=prefetched_info.linear_unique_indices_length,
+ )
+ updated_weights = torch.empty(
+ [
+ prefetched_info.linear_unique_cache_indices.size()[0],
+ self.max_D_cache,
+ ],
+ # pyre-ignore Incompatible parameter type [6]: In call `torch._C._VariableFunctions.empty`, for argument `dtype`, expected `Optional[dtype]` but got `Union[Module, dtype, Tensor]`
+ dtype=self.lxu_cache_weights.dtype,
+ # pyre-ignore Incompatible parameter type [6]: In call `torch._C._VariableFunctions.empty`, for argument `device`, expected `Union[None, int, str, device]` but got `Union[Module, device, Tensor]`
+ device=self.lxu_cache_weights.device,
+ )
+ torch.ops.fbgemm.masked_index_select(
+ updated_weights,
+ updated_locations,
+ self.lxu_cache_weights,
+ prefetched_info.linear_unique_indices_length,
+ )
+ # TODO: this statement triggers a sync; added here to make this diff
+ # self-contained, and will be removed in a later change.
+ cache_hit_mask_index = (
+ updated_locations.narrow(
+ 0, 0, prefetched_info.linear_unique_indices_length.item()
+ )
+ .not_equal(-1)
+ .nonzero()
+ .flatten()
+ )
+ # stream weights
+ self._raw_embedding_streamer.stream(
+ prefetched_info.linear_unique_indices.index_select(
+ dim=0, index=cache_hit_mask_index
+ ).to(device=torch.device("cpu")),
+ updated_weights.index_select(dim=0, index=cache_hit_mask_index).to(
+ device=torch.device("cpu")
+ ),
+ (
+ prefetched_info.hash_zch_identities.index_select(
+ dim=0, index=cache_hit_mask_index
+ ).to(device=torch.device("cpu"))
+ if prefetched_info.hash_zch_identities is not None
+ else None
+ ),
+ (
+ prefetched_info.hash_zch_runtime_meta.index_select(
+ dim=0, index=cache_hit_mask_index
+ ).to(device=torch.device("cpu"))
+ if prefetched_info.hash_zch_runtime_meta is not None
+ else None
+ ),
+ prefetched_info.linear_unique_indices_length.to(
+ device=torch.device("cpu")
+ ),
+ False, # require_tensor_copy
+ False, # blocking_tensor_copy
+ )
+
+ @staticmethod
+ @torch.jit.ignore
+ def _get_prefetched_info(
+ linear_indices: torch.Tensor,
+ linear_cache_indices_merged: torch.Tensor,
+ total_cache_hash_size: int,
+ hash_zch_identities: Optional[torch.Tensor],
+ hash_zch_runtime_meta: Optional[torch.Tensor],
+ max_indices_length: int,
+ ) -> PrefetchedInfo:
+ (
+ linear_unique_cache_indices,
+ linear_unique_cache_indices_length,
+ linear_unique_cache_indices_cnt,
+ linear_unique_cache_inverse_indices,
+ ) = torch.ops.fbgemm.get_unique_indices_with_inverse(
+ linear_cache_indices_merged,
+ total_cache_hash_size,
+ compute_count=True,
+ compute_inverse_indices=True,
+ )
+ # Pure CPU op, no sync needed; caps the length so the indices cannot
+ # exceed the size of the weights buffer.
+ max_len = min(
+ max_indices_length,
+ linear_unique_cache_indices.size(0),
+ )
+ if max_len < linear_unique_cache_indices.size(0):
+ linear_unique_cache_indices_length.clamp_(max=max_len)
+ # linear_unique_indices is the result after deduplication and sorting
+ linear_unique_cache_indices = linear_unique_cache_indices.narrow(
+ 0, 0, max_len
+ )
+ # Compute cumulative sum as indices for selecting unique elements to
+ # map hash_zch_identities and hash_zch_runtime_meta to linear_unique_indices
+ count_cum_sum = torch.ops.fbgemm.asynchronous_complete_cumsum(
+ linear_unique_cache_indices_cnt
+ )
+ # count_cum_sum has one more element than linear_unique_cache_indices_cnt
+ count_cum_sum = count_cum_sum.narrow(0, 0, max_len)
+ # Clamp the uninitialized elements to avoid out-of-bound access; they are
+ # sliced out later by linear_unique_cache_indices_length. Using
+ # linear_unique_cache_indices_length directly here would require a sync.
+ count_cum_sum.clamp_(min=0, max=linear_unique_cache_inverse_indices.size(0) - 1)
+
+ # Select indices corresponding to the first occurrence of each unique element
+ linear_unique_inverse_indices = (
+ linear_unique_cache_inverse_indices.index_select(dim=0, index=count_cum_sum)
+ )
+ # same clamp as above
+ linear_unique_inverse_indices.clamp_(min=0, max=linear_indices.size(0) - 1)
+ linear_unique_indices = linear_indices.index_select(
+ dim=0, index=linear_unique_inverse_indices
+ )
+ if hash_zch_identities is not None:
+ # Map hash_zch_identities to unique indices
+ hash_zch_identities = hash_zch_identities.index_select(
+ dim=0, index=linear_unique_inverse_indices
+ )
+
+ if hash_zch_runtime_meta is not None:
+ # Map hash_zch_runtime_meta to unique indices
+ hash_zch_runtime_meta = hash_zch_runtime_meta.index_select(
+ dim=0, index=linear_unique_inverse_indices
+ )
+
+ return PrefetchedInfo(
+ linear_unique_indices,
+ linear_unique_cache_indices,
+ linear_unique_cache_indices_length,
+ hash_zch_identities,
+ hash_zch_runtime_meta,
+ )
+
+ @torch.jit.ignore
+ def _store_prefetched_tensors(
+ self,
+ indices: torch.Tensor,
+ offsets: torch.Tensor,
+ vbe_metadata: Optional[invokers.lookup_args.VBEMetadata],
+ linear_cache_indices_merged: torch.Tensor,
+ final_lxu_cache_locations: torch.Tensor,
+ hash_zch_identities: Optional[torch.Tensor],
+ hash_zch_runtime_meta: Optional[torch.Tensor],
+ ) -> None:
+ """
+ NOTE: this needs to be a method with jit.ignore because the identities tensor is conditional.
+ This function stores the prefetched tensors for raw embedding streaming.
+ """
+ if not self.enable_raw_embedding_streaming:
+ return
+
+ with record_function(
+ "## uvm_save_prefetched_rows {} {} ##".format(self.timestep, self.uuid)
+ ):
+ found_in_cache_mask = final_lxu_cache_locations != -1
+ # Only process the indices that are found in the cache; this filters out
+ # indices from tables that don't have UVM_CACHE enabled.
+ linear_cache_indices_merged_masked = torch.where(
+ found_in_cache_mask,
+ linear_cache_indices_merged,
+ self.total_cache_hash_size,
+ )
+ linearize_indices = torch.ops.fbgemm.linearize_cache_indices(
+ self.hash_size_cumsum,
+ indices,
+ offsets,
+ vbe_metadata.B_offsets if vbe_metadata is not None else None,
+ vbe_metadata.max_B if vbe_metadata is not None else -1,
+ )
+ # -1 indices are ignored in raw_embedding_streamer.
+ linearize_indices_masked = torch.where(
+ found_in_cache_mask,
+ linearize_indices,
+ -1,
+ )
+ # Process hash_zch_identities using the helper function
+ prefetched_info = self._get_prefetched_info(
+ linearize_indices_masked,
+ linear_cache_indices_merged_masked,
+ self.total_cache_hash_size,
+ hash_zch_identities,
+ hash_zch_runtime_meta,
+ self.lxu_cache_weights.size(0),
+ )
+
+ self.prefetched_info_list.append(prefetched_info)
+
+ @torch.jit.ignore
+ def __report_input_params_factory(
+ self,
+ ) -> Optional[Callable[..., None]]:
+ """
+ This function returns a callable based on the environment variable `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL`.
+
+ If `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` is set to a value greater than 0, it returns a callable that:
+ - Reports input parameters (TBEDataConfig).
+ - Writes the output as a JSON file.
+
+ If `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` is not set or is set to 0, it returns a dummy callable that performs no action.
+ """
+ try:
+ if self._feature_is_enabled(FeatureGateName.TBE_REPORT_INPUT_PARAMS):
+ from fbgemm_gpu.tbe.stats import TBEBenchmarkParamsReporter
+
+ reporter = TBEBenchmarkParamsReporter.create()
+ return reporter.report_stats
+ except Exception:
+ return None
+
+ return None
+

  class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
  """
@@ -3817,12 +4458,12 @@ class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
  max_D: int
  hash_size_cumsum: Tensor
  total_hash_size_bits: int
- embedding_specs: List[Tuple[int, int]]
+ embedding_specs: list[tuple[int, int]]

  def __init__(
  self,
- embedding_specs: List[Tuple[int, int]], # tuple of (rows, dims)
- feature_table_map: Optional[List[int]] = None, # [T]
+ embedding_specs: list[tuple[int, int]], # tuple of (rows, dims)
+ feature_table_map: Optional[list[int]] = None, # [T]
  weights_precision: SparseType = SparseType.FP32,
  pooling_mode: PoolingMode = PoolingMode.SUM,
  use_cpu: bool = False,
@@ -3865,7 +4506,7 @@ class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
  )

  self.embedding_specs = embedding_specs
- (rows, dims) = zip(*embedding_specs)
+ rows, dims = zip(*embedding_specs)
  T_ = len(self.embedding_specs)
  assert T_ > 0

@@ -3935,7 +4576,7 @@ class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
  row for (row, _) in embedding_specs[:t]
  )

- self.weights_physical_offsets: List[int] = weights_offsets
+ self.weights_physical_offsets: list[int] = weights_offsets
  weights_offsets = [weights_offsets[t] for t in feature_table_map]
  self.register_buffer(
  "weights_offsets",
@@ -3962,7 +4603,7 @@ class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
  def _generate_vbe_metadata(
  self,
  offsets: Tensor,
- batch_size_per_feature_per_rank: Optional[List[List[int]]],
+ batch_size_per_feature_per_rank: Optional[list[list[int]]],
  ) -> invokers.lookup_args.VBEMetadata:
  # Blocking D2H copy, but only runs at first call
  self.feature_dims = self.feature_dims.cpu()
@@ -3980,7 +4621,7 @@ class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
  offsets: Tensor,
  per_sample_weights: Optional[Tensor] = None,
  feature_requires_grad: Optional[Tensor] = None,
- batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+ batch_size_per_feature_per_rank: Optional[list[list[int]]] = None,
  ) -> Tensor:
  # Generate VBE metadata
  vbe_metadata = self._generate_vbe_metadata(
@@ -4019,7 +4660,7 @@ class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
  )

  @torch.jit.export
- def split_embedding_weights(self) -> List[Tensor]:
+ def split_embedding_weights(self) -> list[Tensor]:
  """
  Returns a list of weights, split by table
  """