fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/split_table_batched_embeddings_ops_common.py
@@ -11,12 +11,11 @@
 
 import enum
 from dataclasses import dataclass
-from typing import List, NamedTuple
+from typing import FrozenSet, NamedTuple, Optional, Tuple
 
 import torch
 from torch import Tensor
 
-
 # Maximum number of times prefetch() can be called without
 # a corresponding forward() call
 MAX_PREFETCH_DEPTH = 100
@@ -33,6 +32,17 @@ class EmbeddingLocation(enum.IntEnum):
     HOST = 3
     MTIA = 4
 
+    @classmethod
+    # pyre-ignore[3]
+    def str_values(cls):
+        return [
+            "device",
+            "managed",
+            "managed_caching",
+            "host",
+            "mtia",
+        ]
+
     @classmethod
     # pyre-ignore[3]
     def from_str(cls, key: str):
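The str_values() helper added here enumerates the string forms that the existing from_str() parser (context in the next hunk) accepts. A minimal round-trip sketch, assuming only the enum as shown in this diff:

    from fbgemm_gpu.split_table_batched_embeddings_ops_common import EmbeddingLocation

    # Each string listed by str_values() parses back to an enum member.
    for name in EmbeddingLocation.str_values():
        assert isinstance(EmbeddingLocation.from_str(name), EmbeddingLocation)

    EmbeddingLocation.from_str("unknown")  # raises ValueError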
@@ -49,6 +59,246 @@ class EmbeddingLocation(enum.IntEnum):
             raise ValueError(f"Cannot parse value into EmbeddingLocation: {key}")
 
 
+class EvictionPolicy(NamedTuple):
+    eviction_trigger_mode: int = (
+        0  # 0: disabled, 1: iteration, 2: mem_util, 3: manual, 4: id count, 5: free_mem
+    )
+    eviction_strategy: int = (
+        0  # 0: timestamp, 1: counter, 2: counter + timestamp, 3: feature l2 norm, 4: timestamp threshold, 5: feature score
+    )
+    eviction_step_intervals: Optional[int] = (
+        None  # trigger_step_interval if trigger mode is iteration
+    )
+    eviction_mem_threshold_gb: Optional[int] = (
+        None  # eviction trigger condition if trigger mode is mem_util
+    )
+    counter_thresholds: Optional[list[int]] = (
+        None  # count_thresholds for each table if eviction strategy is counter
+    )
+    ttls_in_mins: Optional[list[int]] = (
+        None  # ttls_in_mins for each table if eviction strategy is timestamp
+    )
+    counter_decay_rates: Optional[list[float]] = (
+        None  # count_decay_rates for each table if eviction strategy is counter
+    )
+    feature_score_counter_decay_rates: Optional[list[float]] = (
+        None  # feature_score_counter_decay_rates for each table if eviction strategy is feature score
+    )
+    training_id_eviction_trigger_count: Optional[list[int]] = (
+        None  # Number of training IDs that, when exceeded, will trigger eviction for each table.
+    )
+    training_id_keep_count: Optional[list[int]] = (
+        None  # Target number of training IDs to retain in each table after eviction.
+    )
+    l2_weight_thresholds: Optional[list[float]] = (
+        None  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+    )
+    threshold_calculation_bucket_stride: Optional[float] = (
+        0.2  # The width of each feature score bucket used for threshold calculation in feature score-based eviction.
+    )
+    threshold_calculation_bucket_num: Optional[int] = (
+        1000000  # 1M, total number of feature score buckets used for threshold calculation in feature score-based eviction.
+    )
+    interval_for_insufficient_eviction_s: int = (
+        # Wait at least this many seconds before triggering the next round of
+        # eviction if the last finished eviction was insufficient, i.e. it did
+        # not evict enough rows; waiting longer avoids another insufficient
+        # eviction.
+        600
+    )
+    interval_for_sufficient_eviction_s: int = (
+        # Wait at least this many seconds before triggering the next round of
+        # eviction if the last finished eviction was sufficient.
+        60
+    )
+    interval_for_feature_statistics_decay_s: int = (
+        24 * 3600  # 1 day, interval for feature statistics decay
+    )
+    meta_header_lens: Optional[list[int]] = None  # metaheader length for each table
+    eviction_free_mem_threshold_gb: Optional[int] = (
+        None  # Minimum free memory (in GB) required before triggering eviction when using free_mem trigger mode.
+    )
+    eviction_free_mem_check_interval_batch: Optional[int] = (
+        None  # Number of batches between checks of the free-memory threshold when using free_mem trigger mode.
+    )
+    enable_eviction_for_feature_score_eviction_policy: Optional[list[bool]] = (
+        None  # enable eviction per table when the eviction policy is feature score; False means no eviction
+    )
+
+    def validate(self) -> None:
+        assert self.eviction_trigger_mode in [0, 1, 2, 3, 4, 5], (
+            "eviction_trigger_mode must be 0, 1, 2, 3, 4, or 5, "
+            f"actual {self.eviction_trigger_mode}"
+        )
+        if self.eviction_trigger_mode == 0:
+            return
+
+        assert self.eviction_strategy in [0, 1, 2, 3, 4, 5], (
+            "eviction_strategy must be 0, 1, 2, 3, 4, or 5, "
+            f"actual {self.eviction_strategy}"
+        )
+        if self.eviction_trigger_mode == 1:
+            assert (
+                self.eviction_step_intervals is not None
+                and self.eviction_step_intervals > 0
+            ), (
+                "eviction_step_intervals must be positive if eviction_trigger_mode is 1, "
+                f"actual {self.eviction_step_intervals}"
+            )
+        elif self.eviction_trigger_mode == 2:
+            assert (
+                self.eviction_mem_threshold_gb is not None
+            ), "eviction_mem_threshold_gb must be set if eviction_trigger_mode is 2"
+        elif self.eviction_trigger_mode == 4:
+            assert (
+                self.training_id_eviction_trigger_count is not None
+            ), "training_id_eviction_trigger_count must be set if eviction_trigger_mode is 4"
+        elif self.eviction_trigger_mode == 5:
+            assert (
+                self.eviction_free_mem_threshold_gb is not None
+            ), "eviction_free_mem_threshold_gb must be set if eviction_trigger_mode is 5"
+            assert (
+                self.eviction_free_mem_check_interval_batch is not None
+            ), "eviction_free_mem_check_interval_batch must be set if eviction_trigger_mode is 5"
+
+        if self.eviction_strategy == 0:
+            assert self.ttls_in_mins is not None, (
+                "ttls_in_mins must be set if eviction_strategy is 0, "
+                f"actual {self.ttls_in_mins}"
+            )
+        elif self.eviction_strategy == 1:
+            assert self.counter_thresholds is not None, (
+                "counter_thresholds must be set if eviction_strategy is 1, "
+                f"actual {self.counter_thresholds}"
+            )
+            assert self.counter_decay_rates is not None, (
+                "counter_decay_rates must be set if eviction_strategy is 1, "
+                f"actual {self.counter_decay_rates}"
+            )
+            assert len(self.counter_thresholds) == len(self.counter_decay_rates), (
+                "counter_thresholds and counter_decay_rates must have the same length, "
+                f"actual {self.counter_thresholds} vs {self.counter_decay_rates}"
+            )
+        elif self.eviction_strategy == 2:
+            assert self.counter_thresholds is not None, (
+                "counter_thresholds must be set if eviction_strategy is 2, "
+                f"actual {self.counter_thresholds}"
+            )
+            assert self.counter_decay_rates is not None, (
+                "counter_decay_rates must be set if eviction_strategy is 2, "
+                f"actual {self.counter_decay_rates}"
+            )
+            assert self.ttls_in_mins is not None, (
+                "ttls_in_mins must be set if eviction_strategy is 2, "
+                f"actual {self.ttls_in_mins}"
+            )
+            assert len(self.counter_thresholds) == len(self.counter_decay_rates), (
+                "counter_thresholds and counter_decay_rates must have the same length, "
+                f"actual {self.counter_thresholds} vs {self.counter_decay_rates}"
+            )
+            assert len(self.counter_thresholds) == len(self.ttls_in_mins), (
+                "counter_thresholds and ttls_in_mins must have the same length, "
+                f"actual {self.counter_thresholds} vs {self.ttls_in_mins}"
+            )
+        elif self.eviction_strategy == 5:
+            assert self.feature_score_counter_decay_rates is not None, (
+                "feature_score_counter_decay_rates must be set if eviction_strategy is 5, "
+                f"actual {self.feature_score_counter_decay_rates}"
+            )
+            assert self.training_id_eviction_trigger_count is not None, (
+                "training_id_eviction_trigger_count must be set if eviction_strategy is 5, "
+                f"actual {self.training_id_eviction_trigger_count}"
+            )
+            assert self.training_id_keep_count is not None, (
+                "training_id_keep_count must be set if eviction_strategy is 5, "
+                f"actual {self.training_id_keep_count}"
+            )
+            assert self.threshold_calculation_bucket_stride is not None, (
+                "threshold_calculation_bucket_stride must be set if eviction_strategy is 5, "
+                f"actual {self.threshold_calculation_bucket_stride}"
+            )
+            assert self.threshold_calculation_bucket_num is not None, (
+                "threshold_calculation_bucket_num must be set if eviction_strategy is 5, "
+                f"actual {self.threshold_calculation_bucket_num}"
+            )
+            assert self.enable_eviction_for_feature_score_eviction_policy is not None, (
+                "enable_eviction_for_feature_score_eviction_policy must be set if eviction_strategy is 5, "
+                f"actual {self.enable_eviction_for_feature_score_eviction_policy}"
+            )
+            assert (
+                len(self.enable_eviction_for_feature_score_eviction_policy)
+                == len(self.training_id_keep_count)
+                == len(self.feature_score_counter_decay_rates)
+            ), (
+                "enable_eviction_for_feature_score_eviction_policy, training_id_keep_count, and feature_score_counter_decay_rates must have the same length, "
+                f"actual {self.training_id_keep_count} vs {self.feature_score_counter_decay_rates} vs {self.enable_eviction_for_feature_score_eviction_policy}"
+            )
+
+
+class KVZCHParams(NamedTuple):
+    # Global bucket id start and end offsets for each logical table,
+    # where the start offset is inclusive and the end offset is exclusive.
+    bucket_offsets: list[tuple[int, int]] = []
+    # Bucket size for each logical table: the input space covered by each
+    # bucket id, e.g. 2^50 / total_num_buckets.
+    bucket_sizes: list[int] = []
+    # Enable optimizer offloading or not.
+    enable_optimizer_offloading: bool = False
+    # When enabled, the backend returns the whole row (metaheader + weight +
+    # optimizer) instead of the weight only; can only be enabled when
+    # enable_optimizer_offloading is enabled.
+    backend_return_whole_row: bool = False
+    eviction_policy: EvictionPolicy = EvictionPolicy()
+    embedding_cache_mode: bool = False
+    load_ckpt_without_opt: bool = False
+    optimizer_type_for_st: Optional[str] = None
+    optimizer_state_dtypes_for_st: Optional[FrozenSet[Tuple[str, int]]] = None
+
+    def validate(self) -> None:
+        assert len(self.bucket_offsets) == len(self.bucket_sizes), (
+            "bucket_offsets and bucket_sizes must have the same length, "
+            f"actual {self.bucket_offsets} vs {self.bucket_sizes}"
+        )
+        self.eviction_policy.validate()
+        assert (
+            not self.backend_return_whole_row or self.enable_optimizer_offloading
+        ), "backend_return_whole_row can only be enabled when enable_optimizer_offloading is enabled"
+
+
+class KVZCHTBEConfig(NamedTuple):
+    # Eviction trigger mode for the kvzch table: 0: disabled, 1: iteration, 2: mem_util, 3: manual, 4: id count, 5: free_mem
+    kvzch_eviction_trigger_mode: int = 2  # mem_util
+    # Minimum free memory (in GB) required before triggering eviction when using free_mem trigger mode.
+    eviction_free_mem_threshold_gb: int = 200  # 200 GB
+    # Number of batches between checks of the free-memory threshold when using free_mem trigger mode.
+    eviction_free_mem_check_interval_batch: int = 1000
+    # The width of each feature score bucket used for threshold calculation in feature score-based eviction.
+    threshold_calculation_bucket_stride: float = 0.2
+    # Total number of feature score buckets used for threshold calculation in feature score-based eviction.
+    threshold_calculation_bucket_num: Optional[int] = 1000000  # 1M
+    # When True, only the weight is saved to the kvzch backend, not the optimizer state.
+    load_ckpt_without_opt: bool = False
+    # [DO NOT USE] For st publish only; do not set this in your config.
+    optimizer_type_for_st: Optional[str] = None
+    # [DO NOT USE] For st publish only; do not set this in your config.
+    optimizer_state_dtypes_for_st: Optional[FrozenSet[Tuple[str, int]]] = None
+
+
+class BackendType(enum.IntEnum):
+    SSD = 0
+    DRAM = 1
+    PS = 2
+
+    @classmethod
+    # pyre-ignore[3]
+    def from_str(cls, key: str):
+        lookup = {
+            "ssd": BackendType.SSD,
+            "dram": BackendType.DRAM,
+        }
+        if key in lookup:
+            return lookup[key]
+        else:
+            raise ValueError(f"Cannot parse value into BackendType: {key}")
+
+
 class CacheAlgorithm(enum.Enum):
     LRU = 0
     LFU = 1
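For orientation, a minimal sketch of how the new tuples compose; the values below are illustrative only, chosen to satisfy the validate() checks added above:

    # Iteration-triggered eviction (trigger mode 1) with the timestamp
    # strategy (0) for two logical tables.
    policy = EvictionPolicy(
        eviction_trigger_mode=1,      # 1: iteration
        eviction_strategy=0,          # 0: timestamp
        eviction_step_intervals=100,  # must be positive when trigger mode is 1
        ttls_in_mins=[60, 1440],      # required when strategy is 0
    )

    params = KVZCHParams(
        bucket_offsets=[(0, 8), (8, 16)],  # [start, end) bucket ids per table
        bucket_sizes=[2**40, 2**40],       # input space per bucket id
        enable_optimizer_offloading=True,
        backend_return_whole_row=True,     # only legal with optimizer offloading
        eviction_policy=policy,
    )
    params.validate()  # also runs policy.validate()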
@@ -106,6 +356,12 @@ class BoundsCheckMode(enum.IntEnum):
     V2_FATAL = 6
 
 
+class ComputeDevice(enum.IntEnum):
+    CPU = 0
+    CUDA = 1
+    MTIA = 2
+
+
 class EmbeddingSpecInfo(enum.IntEnum):
     feature_names = 0
     rows = 1
@@ -125,8 +381,8 @@ SplitState: NamedTuple = NamedTuple(
         ("dev_size", int),
         ("host_size", int),
         ("uvm_size", int),
-        ("placements", List[EmbeddingLocation]),
-        ("offsets", List[int]),
+        ("placements", list[EmbeddingLocation]),
+        ("offsets", list[int]),
     ],
 )
 
@@ -134,15 +390,15 @@ SplitState: NamedTuple = NamedTuple(
 @dataclass
 class CacheState:
     # T + 1 elements and cache_hash_size_cumsum[-1] == total_cache_hash_size
-    cache_hash_size_cumsum: List[int]
-    cache_index_table_map: List[int]
+    cache_hash_size_cumsum: list[int]
+    cache_index_table_map: list[int]
     total_cache_hash_size: int
 
 
 def construct_cache_state(
-    row_list: List[int],
-    location_list: List[EmbeddingLocation],
-    feature_table_map: List[int],
+    row_list: list[int],
+    location_list: list[EmbeddingLocation],
+    feature_table_map: list[int],
 ) -> CacheState:
     _cache_hash_size_cumsum = [0]
     total_cache_hash_size = 0
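This hunk and most of the remaining ones in both files are a mechanical migration from typing.List and typing.Tuple to the PEP 585 builtin generics, which are always available on the Python 3.11 (cp311) target of this wheel. The pattern throughout is simply:

    from typing import Optional  # the List and Tuple imports are dropped

    # Before: def f(xs: List[int]) -> Tuple[int, int]
    # After: builtin generics, no typing import needed for these
    def f(xs: list[int]) -> tuple[int, int]:
        return xs[0], xs[-1]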
@@ -215,3 +471,13 @@ def get_new_embedding_location(
     # UVM caching
     else:
         return EmbeddingLocation.MANAGED_CACHING
+
+
+def get_bounds_check_version_for_platform() -> int:
+    # NOTE: Use bounds_check_indices v2 on ROCm because ROCm has a
+    # constraint that the gridDim * blockDim has to be smaller than
+    # 2^32. The v1 kernel can be launched with gridDim * blockDim >
+    # 2^32 while the v2 kernel limits the gridDim size to 64 * # of
+    # SMs. Thus, its gridDim * blockDim is guaranteed to be smaller
+    # than 2^32.
+    return 2 if (torch.cuda.is_available() and torch.version.hip) else 1
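This new helper is consumed by the inference module below: it is imported there, stored as self.bounds_check_version in __init__, and forwarded to the bounds_check_indices op in both forward paths. A usage sketch; on this CPU-only wheel torch.cuda.is_available() is False, so version 1 is always selected:

    from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
        get_bounds_check_version_for_platform,
    )

    # CPU and CUDA builds select v1; ROCm builds (torch.version.hip is set)
    # select v2, whose gridDim cap of 64 * SM count keeps
    # gridDim * blockDim below 2^32.
    assert get_bounds_check_version_for_platform() in (1, 2)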
fbgemm_gpu/split_table_batched_embeddings_ops_inference.py
@@ -12,7 +12,7 @@
 import logging
 import uuid
 from itertools import accumulate
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Union
 
 import fbgemm_gpu  # noqa: F401
 import torch  # usort:skip
@@ -28,6 +28,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
     DEFAULT_SCALE_BIAS_SIZE_IN_BYTES,
     EmbeddingLocation,
     EmbeddingSpecInfo,
+    get_bounds_check_version_for_platform,
     get_new_embedding_location,
     MAX_PREFETCH_DEPTH,
     PoolingMode,
@@ -91,14 +92,14 @@ def align_to_cacheline(a: int) -> int:
 
 
 def nbit_construct_split_state(
-    embedding_specs: List[Tuple[str, int, int, SparseType, EmbeddingLocation]],
+    embedding_specs: list[tuple[str, int, int, SparseType, EmbeddingLocation]],
     cacheable: bool,
     row_alignment: int,
     scale_bias_size_in_bytes: int = DEFAULT_SCALE_BIAS_SIZE_IN_BYTES,
     cacheline_alignment: bool = True,
 ) -> SplitState:
-    placements = torch.jit.annotate(List[EmbeddingLocation], [])
-    offsets = torch.jit.annotate(List[int], [])
+    placements = torch.jit.annotate(list[EmbeddingLocation], [])
+    offsets = torch.jit.annotate(list[int], [])
     dev_size = 0
     host_size = 0
     uvm_size = 0
@@ -164,7 +165,7 @@ def inputs_to_device(
     offsets: torch.Tensor,
     per_sample_weights: Optional[torch.Tensor],
     bounds_check_warning: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     if bounds_check_warning.device.type == "meta":
         return indices, offsets, per_sample_weights
 
@@ -330,7 +331,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         Options are `torch.int32` and `torch.int64`.
     """
 
-    embedding_specs: List[Tuple[str, int, int, SparseType, EmbeddingLocation]]
+    embedding_specs: list[tuple[str, int, int, SparseType, EmbeddingLocation]]
     record_cache_metrics: RecordCacheMetrics
     # pyre-fixme[13]: Attribute `cache_miss_counter` is never initialized.
     cache_miss_counter: torch.Tensor
@@ -345,15 +346,15 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
 
     def __init__(  # noqa C901
         self,
-        embedding_specs: List[
-            Tuple[str, int, int, SparseType, EmbeddingLocation]
+        embedding_specs: list[
+            tuple[str, int, int, SparseType, EmbeddingLocation]
         ],  # tuple of (feature_names, rows, dims, SparseType, EmbeddingLocation/placement)
-        feature_table_map: Optional[List[int]] = None,  # [T]
-        index_remapping: Optional[List[Tensor]] = None,
+        feature_table_map: Optional[list[int]] = None,  # [T]
+        index_remapping: Optional[list[Tensor]] = None,
         pooling_mode: PoolingMode = PoolingMode.SUM,
         device: Optional[Union[str, int, torch.device]] = None,
         bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING,
-        weight_lists: Optional[List[Tuple[Tensor, Optional[Tensor]]]] = None,
+        weight_lists: Optional[list[tuple[Tensor, Optional[Tensor]]]] = None,
         pruning_hash_load_factor: float = 0.5,
         use_array_for_index_remapping: bool = True,
         output_dtype: SparseType = SparseType.FP16,
@@ -372,7 +373,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         cacheline_alignment: bool = True,
         uvm_host_mapped: bool = False,  # True to use cudaHostAlloc; False to use cudaMallocManaged.
         reverse_qparam: bool = False,  # True to load qparams at the end of each row; False to load qparams at the beginning of each row.
-        feature_names_per_table: Optional[List[List[str]]] = None,
+        feature_names_per_table: Optional[list[list[str]]] = None,
         indices_dtype: torch.dtype = torch.int32,  # Used for construction of the remap_indices tensors. Should match the dtype of the indices passed in the forward() call (INT32 or INT64).
     ) -> None:  # noqa C901  # tuple of (rows, dims,)
         super(IntNBitTableBatchedEmbeddingBagsCodegen, self).__init__()
@@ -405,14 +406,14 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         self.indices_dtype = indices_dtype
         # (feature_names, rows, dims, weights_tys, locations) = zip(*embedding_specs)
         # Pyre workaround
-        self.feature_names: List[str] = [e[0] for e in embedding_specs]
+        self.feature_names: list[str] = [e[0] for e in embedding_specs]
         self.cache_load_factor: float = cache_load_factor
         self.cache_sets: int = cache_sets
         self.cache_reserved_memory: float = cache_reserved_memory
-        rows: List[int] = [e[1] for e in embedding_specs]
-        dims: List[int] = [e[2] for e in embedding_specs]
-        weights_tys: List[SparseType] = [e[3] for e in embedding_specs]
-        locations: List[EmbeddingLocation] = [e[4] for e in embedding_specs]
+        rows: list[int] = [e[1] for e in embedding_specs]
+        dims: list[int] = [e[2] for e in embedding_specs]
+        weights_tys: list[SparseType] = [e[3] for e in embedding_specs]
+        locations: list[EmbeddingLocation] = [e[4] for e in embedding_specs]
         # if target device is meta then we set use_cpu based on the embedding location
         # information in embedding_specs.
         if self.current_device.type == "meta":
@@ -452,7 +453,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         T_ = len(self.embedding_specs)
         assert T_ > 0
 
-        self.feature_table_map: List[int] = (
+        self.feature_table_map: list[int] = (
             feature_table_map if feature_table_map is not None else list(range(T_))
         )
         T = len(self.feature_table_map)
@@ -635,6 +636,8 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self.fp8_exponent_bits = -1
             self.fp8_exponent_bias = -1
 
+        self.bounds_check_version: int = get_bounds_check_version_for_platform()
+
     @torch.jit.ignore
     def log(self, msg: str) -> None:
         """
@@ -673,7 +676,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         return self.table_wise_cache_miss
 
     @torch.jit.export
-    def get_feature_num_per_table(self) -> List[int]:
+    def get_feature_num_per_table(self) -> list[int]:
         if self.feature_names_per_table is None:
             return []
         return [len(feature_names) for feature_names in self.feature_names_per_table]
@@ -975,6 +978,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 self.bounds_check_mode_int,
                 self.bounds_check_warning,
                 per_sample_weights,
+                bounds_check_version=self.bounds_check_version,
             )
 
         # Index remapping changes input indices, and some of them become -1 (pruned rows).
@@ -1017,6 +1021,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
                 self.bounds_check_mode_int,
                 self.bounds_check_warning,
                 per_sample_weights,
+                bounds_check_version=self.bounds_check_version,
             )
         # Note: CPU and CUDA ops use the same interface to facilitate JIT IR
         # generation for CUDA/CPU. For CPU op, we don't need weights_uvm and
@@ -1206,8 +1211,8 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         dev_size: int,
         host_size: int,
         uvm_size: int,
-        placements: List[int],
-        offsets: List[int],
+        placements: list[int],
+        offsets: list[int],
         enforce_hbm: bool,
     ) -> None:
         assert not self.weight_initialized, "Weights have already been initialized."
@@ -1516,6 +1521,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         for i, weight in enumerate(weights):
             weights[i] = (
                 weight[0].to(device),
+                # pyre-fixme[16]: Undefined attribute: `Optional` has no attribute `to`.
                 weight[1].to(device) if weight[1] is not None else None,
             )
         (
@@ -1596,7 +1602,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
     @torch.jit.export
     def split_embedding_weights_with_scale_bias(
         self, split_scale_bias_mode: int = 1
-    ) -> List[Tuple[Tensor, Optional[Tensor], Optional[Tensor]]]:
+    ) -> list[tuple[Tensor, Optional[Tensor], Optional[Tensor]]]:
         """
         Returns a list of weights, split by table
         split_scale_bias_mode:
@@ -1605,7 +1611,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
             2: return weights, scale, bias.
         """
         assert self.weight_initialized
-        splits: List[Tuple[Tensor, Optional[Tensor], Optional[Tensor]]] = []
+        splits: list[tuple[Tensor, Optional[Tensor], Optional[Tensor]]] = []
         for t, (_, rows, dim, weight_ty, _) in enumerate(self.embedding_specs):
             placement = self.weights_physical_placements[t]
             if (
@@ -1730,12 +1736,12 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         # the second with scale_bias.
         # This should've been named as split_scale_bias.
         # Keep as is for backward compatibility.
-    ) -> List[Tuple[Tensor, Optional[Tensor]]]:
+    ) -> list[tuple[Tensor, Optional[Tensor]]]:
         """
         Returns a list of weights, split by table
         """
         # fmt: off
-        splits: List[Tuple[Tensor, Optional[Tensor], Optional[Tensor]]] = (
+        splits: list[tuple[Tensor, Optional[Tensor], Optional[Tensor]]] = (
             self.split_embedding_weights_with_scale_bias(
                 split_scale_bias_mode=(1 if split_scale_shifts else 0)
             )
@@ -1773,7 +1779,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
         )
 
     def assign_embedding_weights(
-        self, q_weight_list: List[Tuple[Tensor, Optional[Tensor]]]
+        self, q_weight_list: list[tuple[Tensor, Optional[Tensor]]]
     ) -> None:
         """
         Assigns self.split_embedding_weights() with values from the input list of weights and scale_shifts.
@@ -1785,6 +1791,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
             dest_weight[0].copy_(input_weight[0])
             if input_weight[1] is not None:
                 assert dest_weight[1] is not None
+                # pyre-fixme[16]: Undefined attribute: `Optional` has no attribute `copy_`.
                 dest_weight[1].copy_(input_weight[1])
             else:
                 assert dest_weight[1] is None
@@ -1792,11 +1799,11 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
     @torch.jit.export
     def set_index_remappings_array(
         self,
-        index_remapping: List[Tensor],
+        index_remapping: list[Tensor],
     ) -> None:
-        rows: List[int] = [e[1] for e in self.embedding_specs]
+        rows: list[int] = [e[1] for e in self.embedding_specs]
         index_remappings_array_offsets = [0]
-        original_feature_rows = torch.jit.annotate(List[int], [])
+        original_feature_rows = torch.jit.annotate(list[int], [])
         last_offset = 0
         for t, mapping in enumerate(index_remapping):
             if mapping is not None:
@@ -1835,11 +1842,11 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
 
     def set_index_remappings(
         self,
-        index_remapping: List[Tensor],
+        index_remapping: list[Tensor],
         pruning_hash_load_factor: float = 0.5,
         use_array_for_index_remapping: bool = True,
     ) -> None:
-        rows: List[int] = [e[1] for e in self.embedding_specs]
+        rows: list[int] = [e[1] for e in self.embedding_specs]
         T = len(self.embedding_specs)
         # Hash mapping pruning
         if not use_array_for_index_remapping:
@@ -1909,7 +1916,7 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
     def _embedding_inplace_update_per_table(
         self,
         update_table_idx: int,
-        update_row_indices: List[int],
+        update_row_indices: list[int],
         update_weights: Tensor,
     ) -> None:
         row_size = len(update_row_indices)
@@ -1934,9 +1941,9 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
     @torch.jit.export
     def embedding_inplace_update(
         self,
-        update_table_indices: List[int],
-        update_row_indices: List[List[int]],
-        update_weights: List[Tensor],
+        update_table_indices: list[int],
+        update_row_indices: list[list[int]],
+        update_weights: list[Tensor],
     ) -> None:
         for i in range(len(update_table_indices)):
             self._embedding_inplace_update_per_table(
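Aside from the List to list rename, the inplace-update API is unchanged. A hedged usage sketch; the tbe handle and tensor below are assumptions for illustration, not part of this diff:

    # Overwrite rows 3 and 7 of table 0 with freshly quantized weights.
    # `tbe` is an IntNBitTableBatchedEmbeddingBagsCodegen instance and
    # `new_rows` a Tensor with one packed row per updated index.
    tbe.embedding_inplace_update(
        update_table_indices=[0],
        update_row_indices=[[3, 7]],
        update_weights=[new_rows],
    )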
@@ -1947,8 +1954,8 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module):
 
     def embedding_inplace_update_internal(
         self,
-        update_table_indices: List[int],
-        update_row_indices: List[int],
+        update_table_indices: list[int],
+        update_row_indices: list[int],
         update_weights: Tensor,
     ) -> None:
         assert len(update_table_indices) == len(update_row_indices)