fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +118 -23
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
- fbgemm_gpu/config/feature_list.py +7 -1
- fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
- fbgemm_gpu/docs/sparse_ops.py +142 -1
- fbgemm_gpu/docs/target.default.json.py +6 -0
- fbgemm_gpu/enums.py +3 -4
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/fbgemm_gpu_config.so +0 -0
- fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
- fbgemm_gpu/fbgemm_gpu_py.so +0 -0
- fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
- fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
- fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
- fbgemm_gpu/quantize/__init__.py +2 -0
- fbgemm_gpu/quantize/quantize_ops.py +1 -0
- fbgemm_gpu/quantize_comm.py +29 -12
- fbgemm_gpu/quantize_utils.py +88 -8
- fbgemm_gpu/runtime_monitor.py +9 -5
- fbgemm_gpu/sll/__init__.py +3 -0
- fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
- fbgemm_gpu/sll/triton/__init__.py +0 -10
- fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
- fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
- fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
- fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
- fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
- fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
- fbgemm_gpu/sparse_ops.py +244 -76
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
- fbgemm_gpu/split_embedding_configs.py +287 -3
- fbgemm_gpu/split_embedding_inference_converter.py +7 -6
- fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
- fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
- fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
- fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
- fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
- fbgemm_gpu/tbe/bench/__init__.py +13 -2
- fbgemm_gpu/tbe/bench/bench_config.py +37 -9
- fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
- fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
- fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
- fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
- fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
- fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
- fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
- fbgemm_gpu/tbe/bench/utils.py +129 -5
- fbgemm_gpu/tbe/cache/__init__.py +1 -0
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
- fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
- fbgemm_gpu/tbe/ssd/common.py +27 -0
- fbgemm_gpu/tbe/ssd/inference.py +15 -15
- fbgemm_gpu/tbe/ssd/training.py +2930 -195
- fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
- fbgemm_gpu/tbe/stats/__init__.py +10 -0
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
- fbgemm_gpu/tbe/utils/offsets.py +6 -6
- fbgemm_gpu/tbe/utils/quantize.py +8 -8
- fbgemm_gpu/tbe/utils/requests.py +53 -28
- fbgemm_gpu/tbe_input_multiplexer.py +16 -7
- fbgemm_gpu/triton/common.py +0 -1
- fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
- fbgemm_gpu/triton/quantize.py +14 -9
- fbgemm_gpu/utils/filestore.py +56 -5
- fbgemm_gpu/utils/torch_library.py +2 -2
- fbgemm_gpu/utils/writeback_util.py +124 -0
- fbgemm_gpu/uvm.py +3 -0
- {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
- fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
- list_versions/cli_run.py +161 -0
- fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
- fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
- {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py

@@ -4,6 +4,8 @@
 ## Template Source: training/python/split_embedding_codegen_lookup_invoker.template
 ################################################################################

+__template_source_file__ = "training/python/split_embedding_codegen_lookup_invoker.template"
+
 #!/usr/bin/env python3

 # Copyright (c) Meta Platforms, Inc. and affiliates.
@@ -16,6 +18,12 @@
 import torch
 import warnings
 from .lookup_args import *
+
+
+
+
+
+
 def invoke(
     common_args: CommonArgs,
     optimizer_args: OptimizerArgs,
@@ -26,6 +34,7 @@ def invoke(
     # only pass prev_iter_dev since prev_iter is never created on UVM
     prev_iter_dev: Optional[torch.Tensor] = None,
     gwd_lower_bound: float = 0.0,
+    mixed_D: bool = True,
     row_counter: Optional[Momentum] = None,
 ) -> torch.Tensor:
     # By design, the warning only shows up once
@@ -36,29 +45,245 @@ def invoke(
         times and build sizes!
         \033[0m"""
     )
-
+    # host_weights is only used for CPU training
+    use_cpu = common_args.host_weights.numel() > 0
     vbe_metadata = common_args.vbe_metadata
-
+
+    # pack weights
+    weights = [
+        common_args.dev_weights,
+        common_args.uvm_weights,
+        common_args.weights_placements,
+        common_args.weights_offsets,
+        common_args.lxu_cache_weights,
+    ] if not use_cpu else [
+        common_args.host_weights,
+        common_args.weights_placements,
+        common_args.weights_offsets,
+    ]
+    dict_aux_tensor: Dict[str, Optional[torch.Tensor]] = {
+        "B_offsets": vbe_metadata.B_offsets,
+        "vbe_output_offsets_feature_rank": vbe_metadata.output_offsets_feature_rank,
+        "vbe_B_offsets_rank_per_feature": vbe_metadata.B_offsets_rank_per_feature,
+        "lxu_cache_locations": common_args.lxu_cache_locations,
+        "uvm_cache_stats": common_args.uvm_cache_stats,
+        "vbe_output_offsets": vbe_metadata.vbe_output_offsets,
+    }
+
+    dict_aux_int: Dict[str, int] = {
+        "iter": iter,
+        "info_B_num_bits": common_args.info_B_num_bits,
+        "info_B_mask": common_args.info_B_mask,
+    }
+
+    dict_aux_float: Dict[str, float] = {
+        "gwd_lower_bound": gwd_lower_bound,
+    }
+
+    dict_aux_bool: Dict[str, bool] = {
+        "is_experimental_tbe": common_args.is_experimental,
+        "use_uniq_cache_locations_bwd": common_args.use_uniq_cache_locations_bwd,
+        "use_homogeneous_placements": common_args.use_homogeneous_placements,
+        "apply_global_weight_decay": apply_global_weight_decay,
+        "mixed_D": mixed_D,
+    }
+    dict_optim_int: Dict[str, int] = {}
+    dict_optim_float: Dict[str, float] = {}
+    dict_optim_bool: Dict[str, bool] = {}
+
+    # Explicitly pass only prev_iter_dev for global weight decay, unless it already exists in optim arg
+    dict_aux_tensor["prev_iter_dev"] = prev_iter_dev
+
+
+    # optimizer_args # if optimizer == none
+    dict_aux_bool["gradient_clipping"] = optimizer_args.gradient_clipping
+    dict_aux_float["max_gradient"] = optimizer_args.max_gradient
+    dict_aux_bool["stochastic_rounding"] = optimizer_args.stochastic_rounding
+    dict_optim_float["eps"] = optimizer_args.eps
+    dict_optim_float["beta1"] = optimizer_args.beta1
+    dict_optim_float["beta2"] = optimizer_args.beta2
+    dict_optim_float["weight_decay"] = optimizer_args.weight_decay
+    dict_optim_bool["use_rowwise_bias_correction"] = optimizer_args.use_rowwise_bias_correction
+
+    momentum1_list = [
+        momentum1.dev,
+        momentum1.uvm,
+        momentum1.placements,
+        momentum1.offsets,
+    ] if not use_cpu else [
+        momentum1.host,
+        momentum1.placements,
+        momentum1.offsets,
+    ] if momentum1 is not None else None
+
+    momentum2_list = [
+        momentum2.dev,
+        momentum2.uvm,
+        momentum2.placements,
+        momentum2.offsets,
+    ] if not use_cpu else [
+        momentum2.host,
+        momentum2.placements,
+        momentum2.offsets,
+    ] if momentum2 is not None else None
+
+    if optimizer_args.use_rowwise_bias_correction and row_counter is not None:
+        row_counter_host = None  # not supported on CPU
+        row_counter_dev = row_counter.dev
+        row_counter_uvm = row_counter.uvm
+        row_counter_offsets = row_counter.offsets
+        row_counter_placements = row_counter.placements
+    elif optimizer_args.use_rowwise_bias_correction:
+        assert False, "`use_rowwise_bias_correction` is set, `row_counter` cannot be None"
+    else:
+        row_counter_host = None
         row_counter_dev = None
         row_counter_uvm = None
         row_counter_offsets = None
         row_counter_placements = None
-
-
+
+
+    aux_tensor: List[Optional[torch.Tensor]] = []
+    assert "B_offsets" in dict_aux_tensor, (
+        "B_offsets must be in dict_aux_tensor. "
+        "Please check the frontend and backend version. "
+    )
+    aux_tensor.append(dict_aux_tensor["B_offsets"])
+    assert "vbe_output_offsets_feature_rank" in dict_aux_tensor, (
+        "vbe_output_offsets_feature_rank must be in dict_aux_tensor. "
+        "Please check the frontend and backend version. "
+    )
+    aux_tensor.append(dict_aux_tensor["vbe_output_offsets_feature_rank"])
+    assert "vbe_B_offsets_rank_per_feature" in dict_aux_tensor, (
+        "vbe_B_offsets_rank_per_feature must be in dict_aux_tensor. "
+        "Please check the frontend and backend version. "
+    )
+    aux_tensor.append(dict_aux_tensor["vbe_B_offsets_rank_per_feature"])
+    assert "lxu_cache_locations" in dict_aux_tensor, (
+        "lxu_cache_locations must be in dict_aux_tensor. "
+        "Please check the frontend and backend version. "
+    )
+    aux_tensor.append(dict_aux_tensor["lxu_cache_locations"])
+    assert "uvm_cache_stats" in dict_aux_tensor, (
+        "uvm_cache_stats must be in dict_aux_tensor. "
+        "Please check the frontend and backend version. "
+    )
+    aux_tensor.append(dict_aux_tensor["uvm_cache_stats"])
+    assert "prev_iter_dev" in dict_aux_tensor, (
+        "prev_iter_dev must be in dict_aux_tensor. "
+        "Please check the frontend and backend version. "
+    )
+    aux_tensor.append(dict_aux_tensor["prev_iter_dev"])
+    assert "vbe_output_offsets" in dict_aux_tensor, (
+        "vbe_output_offsets must be in dict_aux_tensor. "
+        "Please check the frontend and backend version. "
+    )
+    aux_tensor.append(dict_aux_tensor["vbe_output_offsets"])
+
+    aux_int: List[int] = []
+    assert "iter" in dict_aux_int, (
+        "iter must be in dict_aux_int. "
+        "Please check the frontend and backend version. "
+    )
+    aux_int.append(dict_aux_int["iter"])
+    assert "info_B_num_bits" in dict_aux_int, (
+        "info_B_num_bits must be in dict_aux_int. "
+        "Please check the frontend and backend version. "
+    )
+    aux_int.append(dict_aux_int["info_B_num_bits"])
+    assert "info_B_mask" in dict_aux_int, (
+        "info_B_mask must be in dict_aux_int. "
+        "Please check the frontend and backend version. "
+    )
+    aux_int.append(dict_aux_int["info_B_mask"])
+
+    aux_float: List[float] = []
+    assert "gwd_lower_bound" in dict_aux_float, (
+        "gwd_lower_bound must be in dict_aux_float. "
+        "Please check the frontend and backend version. "
+    )
+    aux_float.append(dict_aux_float["gwd_lower_bound"])
+    assert "max_gradient" in dict_aux_float, (
+        "max_gradient must be in dict_aux_float. "
+        "Please check the frontend and backend version. "
+    )
+    aux_float.append(dict_aux_float["max_gradient"])
+
+    aux_bool: List[bool] = []
+    assert "is_experimental_tbe" in dict_aux_bool, (
+        "is_experimental_tbe must be in dict_aux_bool. "
+        "Please check the frontend and backend version. "
+    )
+    aux_bool.append(dict_aux_bool["is_experimental_tbe"])
+    assert "use_uniq_cache_locations_bwd" in dict_aux_bool, (
+        "use_uniq_cache_locations_bwd must be in dict_aux_bool. "
+        "Please check the frontend and backend version. "
+    )
+    aux_bool.append(dict_aux_bool["use_uniq_cache_locations_bwd"])
+    assert "use_homogeneous_placements" in dict_aux_bool, (
+        "use_homogeneous_placements must be in dict_aux_bool. "
+        "Please check the frontend and backend version. "
+    )
+    aux_bool.append(dict_aux_bool["use_homogeneous_placements"])
+    assert "apply_global_weight_decay" in dict_aux_bool, (
+        "apply_global_weight_decay must be in dict_aux_bool. "
+        "Please check the frontend and backend version. "
+    )
+    aux_bool.append(dict_aux_bool["apply_global_weight_decay"])
+    assert "gradient_clipping" in dict_aux_bool, (
+        "gradient_clipping must be in dict_aux_bool. "
+        "Please check the frontend and backend version. "
+    )
+    aux_bool.append(dict_aux_bool["gradient_clipping"])
+    assert "stochastic_rounding" in dict_aux_bool, (
+        "stochastic_rounding must be in dict_aux_bool. "
+        "Please check the frontend and backend version. "
+    )
+    aux_bool.append(dict_aux_bool["stochastic_rounding"])
+    assert "mixed_D" in dict_aux_bool, (
+        "mixed_D must be in dict_aux_bool. "
+        "Please check the frontend and backend version. "
+    )
+    aux_bool.append(dict_aux_bool["mixed_D"])
+    optim_tensor: List[Optional[torch.Tensor]] = []
+    # We cannot do list of optional tensorlist (optional tensorlist is Tensor?[]).
+    # we need to pack optimizer optional tensors in a flatten manner.
+    # We pack unified args (i.e., 5 items) since it's very confusing to pack/unpack per device (i.e, 3 for cpu and 4 for cuda)
+    # e.g., if we have optim optional tensors x and y, the optim_tensor will look like
+    # [x_host, x_dev, x_uvm, x_placements, x_offsets, y_host, y_dev, y_uvm, y_placements, y_offsets]
+    # ['row_counter']
+
+    # using .extend fails torch script
+    if row_counter is None:
+        optim_tensor.append(None)
+        optim_tensor.append(None)
+        optim_tensor.append(None)
+        optim_tensor.append(None)
+        optim_tensor.append(None)
     else:
-
-
-
-
+        optim_tensor.append(row_counter.host)
+        optim_tensor.append(row_counter.dev)
+        optim_tensor.append(row_counter.uvm)
+        optim_tensor.append(row_counter.placements)
+        optim_tensor.append(row_counter.offsets)
+
+    # optim_int
+    # optim_float
+    # ['momentum1', 'momentum2', 'learning_rate_tensor', 'optim_tensor', 'optim_float', 'optim_bool']
+    optim_float: List[float] = []
+    optim_float.append(dict_optim_float["eps"])
+    optim_float.append(dict_optim_float["beta1"])
+    optim_float.append(dict_optim_float["beta2"])
+    optim_float.append(dict_optim_float["weight_decay"])
+    # optim_bool
+    optim_bool: List[bool] = []
+    optim_bool.append(dict_optim_bool["use_rowwise_bias_correction"])

-    return torch.ops.fbgemm.
+    return torch.ops.fbgemm.split_embedding_codegen_lookup_adam_function_pt2(
         # common_args
         placeholder_autograd_tensor=common_args.placeholder_autograd_tensor,
-
-
-        lxu_cache_weights=common_args.lxu_cache_weights,
-        weights_placements=common_args.weights_placements,
-        weights_offsets=common_args.weights_offsets,
+        # weights
+        weights=weights,
         D_offsets=common_args.D_offsets,
         total_D=common_args.total_D,
         max_D=common_args.max_D,
@@ -69,52 +294,35 @@ def invoke(
         pooling_mode=common_args.pooling_mode,
         indice_weights=common_args.indice_weights,
         feature_requires_grad=common_args.feature_requires_grad,
-
-        uvm_cache_stats=common_args.uvm_cache_stats,
+        output_dtype=common_args.output_dtype,
         # VBE metadata
-        B_offsets=vbe_metadata.B_offsets,
-        vbe_output_offsets_feature_rank=vbe_metadata.output_offsets_feature_rank,
-        vbe_B_offsets_rank_per_feature=vbe_metadata.B_offsets_rank_per_feature,
         max_B=vbe_metadata.max_B,
         max_B_feature_rank=vbe_metadata.max_B_feature_rank,
         vbe_output_size=vbe_metadata.output_size,
-
-
-
-
-
-
-
-
-
-
+        vbe_output=vbe_metadata.vbe_output,
+        # aux_tensor
+        aux_tensor=aux_tensor,
+        # aux_int
+        aux_int=aux_int,
+        # aux_float
+        aux_float=aux_float,
+        # aux_bool
+        aux_bool=aux_bool,
+        learning_rate_tensor=common_args.learning_rate_tensor,
+
         # momentum1
-
-        momentum1_uvm=momentum1.uvm,
-        momentum1_offsets=momentum1.offsets,
-        momentum1_placements=momentum1.placements,
+        momentum1=momentum1_list,
         # momentum2
-
-        momentum2_uvm=momentum2.uvm,
-        momentum2_offsets=momentum2.offsets,
-        momentum2_placements=momentum2.placements,
+        momentum2=momentum2_list,
         # prev_iter
-
-        prev_iter_dev=prev_iter_dev,
         # row_counter
-
-
-
-
-
-        #
-
-        #
+        # optim_tensor
+        optim_tensor=optim_tensor,
+        # optim_int
+        # optim_float
+        optim_float=optim_float,
+        # optim_bool
+        optim_bool=optim_bool,
+        # optim symint args
         # total_unique_indices
-        output_dtype=common_args.output_dtype,
-        is_experimental=common_args.is_experimental,
-        use_uniq_cache_locations_bwd=common_args.use_uniq_cache_locations_bwd,
-        use_homogeneous_placements=common_args.use_homogeneous_placements,
-        apply_global_weight_decay=apply_global_weight_decay,
-        gwd_lower_bound=gwd_lower_bound,
     )
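The change above replaces the old per-argument keyword call (`momentum1_dev=...`, `momentum1_uvm=...`, and so on) with a packed-list convention for the new `*_pt2` operator: weight tensors, auxiliary scalars, and optional optimizer state are appended to typed lists in a fixed order. A minimal sketch of that packing idea is shown below; every tensor, shape, and value here is a placeholder chosen for illustration, not the real `CommonArgs` fields or the operator's actual schema.

```python
# Sketch of the packed-argument convention used by the new *_pt2 invokers.
# All tensors below are placeholders; only the packing order mirrors the diff above.
from typing import List, Optional

import torch

use_cpu = False  # in the real invoker: common_args.host_weights.numel() > 0

dev_weights = torch.zeros(16)
uvm_weights = torch.zeros(0)
host_weights = torch.zeros(0)
weights_placements = torch.zeros(2, dtype=torch.int32)
weights_offsets = torch.zeros(2, dtype=torch.int64)
lxu_cache_weights = torch.zeros(0)

# The GPU path packs five tensors, the CPU path packs three; the backend relies
# on this order rather than on keyword names.
weights: List[torch.Tensor] = (
    [dev_weights, uvm_weights, weights_placements, weights_offsets, lxu_cache_weights]
    if not use_cpu
    else [host_weights, weights_placements, weights_offsets]
)

# Scalars are grouped by type into aux_int / aux_float / aux_bool lists, again
# positionally, which is why the invoker asserts each expected key before appending.
aux_int: List[int] = [1, 26, (1 << 26) - 1]  # iter, info_B_num_bits, info_B_mask (placeholder values)
aux_float: List[float] = [0.0, 1.0]          # gwd_lower_bound, max_gradient (placeholder values)
aux_bool: List[bool] = [False, False, True]  # is_experimental_tbe, use_uniq_cache_locations_bwd, use_homogeneous_placements

# Optional optimizer state is flattened as [host, dev, uvm, placements, offsets]
# per tensor, with None filling the slots when that state does not exist.
optim_tensor: List[Optional[torch.Tensor]] = [None] * 5  # e.g. row_counter absent
```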
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py

@@ -4,6 +4,8 @@
 ## Template Source: training/python/lookup_args.template
 ################################################################################

+__template_source_file__ = "training/python/lookup_args.template"
+
 #!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
@@ -25,6 +27,8 @@ class VBEMetadata(NamedTuple):
     max_B_feature_rank: int = -1
     max_B: int = -1
     output_size: int = -1
+    vbe_output: Optional[torch.Tensor] = None
+    vbe_output_offsets: Optional[torch.Tensor] = None


 class CommonArgs(NamedTuple):
@@ -52,71 +56,18 @@ class CommonArgs(NamedTuple):
     is_experimental: bool
     use_uniq_cache_locations_bwd: bool
     use_homogeneous_placements: bool
-
-
-class OptimizerArgs(NamedTuple):
-    stochastic_rounding: bool
-    gradient_clipping: bool
-    max_gradient: float
-    max_norm: float
-    learning_rate: float
-    eps: float
-    beta1: float
-    beta2: float
-    weight_decay: float
-    weight_decay_mode: int
-    eta: float
-    momentum: float
-    counter_halflife: int
-    adjustment_iter: int
-    adjustment_ub: float
-    learning_rate_mode: int
-    grad_sum_decay: int
-    tail_id_threshold: float
-    is_tail_id_thresh_ratio: int
-    total_hash_size: int # Required for OptimType.NONE
-    weight_norm_coefficient: float
-    lower_bound: float
-    regularization_mode: int
-    use_rowwise_bias_correction: bool # Used for OptimType.ADAM
-
-class CommonArgsPT2(NamedTuple):
-    placeholder_autograd_tensor: torch.Tensor
-    dev_weights: torch.Tensor
-    host_weights: torch.Tensor
-    uvm_weights: torch.Tensor
-    lxu_cache_weights: torch.Tensor
-    weights_placements: torch.Tensor
-    weights_offsets: torch.Tensor
-    D_offsets: torch.Tensor
-    total_D: int
-    max_D: int
-    hash_size_cumsum: torch.Tensor
-    total_hash_size_bits: int
-    indices: torch.Tensor
-    offsets: torch.Tensor
-    pooling_mode: int
-    indice_weights: Optional[torch.Tensor]
-    feature_requires_grad: Optional[torch.Tensor]
-    lxu_cache_locations: torch.Tensor
-    uvm_cache_stats: Optional[torch.Tensor]
-    output_dtype: int
-    vbe_metadata: VBEMetadata
-    is_experimental: bool
-    use_uniq_cache_locations_bwd: bool
-    use_homogeneous_placements: bool
+    learning_rate_tensor: torch.Tensor
     info_B_num_bits: int
     info_B_mask: int

-
-
-
-
+
+# Do not add a parameter of Type tensor here. It will cause JIT script error due to a bug in PyTorch.
+# See more detail in D71010630.
+class OptimizerArgs(NamedTuple):
     stochastic_rounding: bool
     gradient_clipping: bool
     max_gradient: float
     max_norm: float
-    learning_rate_tensor: torch.Tensor
     eps: float
     beta1: float
     beta2: float
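In both `lookup_args` variants the `VBEMetadata` NamedTuple gains two optional fields with `None` defaults, so construction sites written against the old layout keep working unchanged. Below is a small, hypothetical illustration using a trimmed mirror of the class (not an import from the wheel); the field names follow the diff above and the values are placeholders.

```python
# Trimmed, hypothetical mirror of the updated VBEMetadata, for illustration only.
from typing import NamedTuple, Optional

import torch


class VBEMetadata(NamedTuple):
    B_offsets: Optional[torch.Tensor]
    output_offsets_feature_rank: Optional[torch.Tensor]
    B_offsets_rank_per_feature: Optional[torch.Tensor]
    max_B_feature_rank: int = -1
    max_B: int = -1
    output_size: int = -1
    vbe_output: Optional[torch.Tensor] = None          # new optional field
    vbe_output_offsets: Optional[torch.Tensor] = None  # new optional field


# Callers that predate the new fields still construct the tuple the same way;
# the new fields simply default to None.
meta = VBEMetadata(
    B_offsets=None,
    output_offsets_feature_rank=None,
    B_offsets_rank_per_feature=None,
)
assert meta.vbe_output is None and meta.vbe_output_offsets is None
```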
fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py

@@ -4,6 +4,8 @@
 ## Template Source: training/python/lookup_args.template
 ################################################################################

+__template_source_file__ = "training/python/lookup_args.template"
+
 #!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
@@ -25,6 +27,8 @@ class VBEMetadata(NamedTuple):
     max_B_feature_rank: int = -1
     max_B: int = -1
     output_size: int = -1
+    vbe_output: Optional[torch.Tensor] = None
+    vbe_output_offsets: Optional[torch.Tensor] = None


 class CommonArgs(NamedTuple):
@@ -53,72 +57,19 @@ class CommonArgs(NamedTuple):
     use_uniq_cache_locations_bwd: bool
     use_homogeneous_placements: bool
     ssd_tensors: Dict[str, torch.Tensor]
-
-
-class OptimizerArgs(NamedTuple):
-    stochastic_rounding: bool
-    gradient_clipping: bool
-    max_gradient: float
-    max_norm: float
-    learning_rate: float
-    eps: float
-    beta1: float
-    beta2: float
-    weight_decay: float
-    weight_decay_mode: int
-    eta: float
-    momentum: float
-    counter_halflife: int
-    adjustment_iter: int
-    adjustment_ub: float
-    learning_rate_mode: int
-    grad_sum_decay: int
-    tail_id_threshold: float
-    is_tail_id_thresh_ratio: int
-    total_hash_size: int # Required for OptimType.NONE
-    weight_norm_coefficient: float
-    lower_bound: float
-    regularization_mode: int
-    use_rowwise_bias_correction: bool # Used for OptimType.ADAM
-
-class CommonArgsPT2(NamedTuple):
-    placeholder_autograd_tensor: torch.Tensor
-    dev_weights: torch.Tensor
-    host_weights: torch.Tensor
-    uvm_weights: torch.Tensor
-    lxu_cache_weights: torch.Tensor
-    weights_placements: torch.Tensor
-    weights_offsets: torch.Tensor
-    D_offsets: torch.Tensor
-    total_D: int
-    max_D: int
-    hash_size_cumsum: torch.Tensor
-    total_hash_size_bits: int
-    indices: torch.Tensor
-    offsets: torch.Tensor
-    pooling_mode: int
-    indice_weights: Optional[torch.Tensor]
-    feature_requires_grad: Optional[torch.Tensor]
-    lxu_cache_locations: torch.Tensor
-    uvm_cache_stats: Optional[torch.Tensor]
-    output_dtype: int
-    vbe_metadata: VBEMetadata
-    is_experimental: bool
-    use_uniq_cache_locations_bwd: bool
-    use_homogeneous_placements: bool
+    enable_optimizer_offloading: bool
+    learning_rate_tensor: torch.Tensor
     info_B_num_bits: int
     info_B_mask: int
-    ssd_tensors: Dict[str, torch.Tensor]

-
-
-
-
+
+# Do not add a parameter of Type tensor here. It will cause JIT script error due to a bug in PyTorch.
+# See more detail in D71010630.
+class OptimizerArgs(NamedTuple):
     stochastic_rounding: bool
     gradient_clipping: bool
     max_gradient: float
     max_norm: float
-    learning_rate_tensor: torch.Tensor
     eps: float
     beta1: float
     beta2: float
|