fbgemm-gpu-hstu-nightly 2025.6.16__cp313-cp313-manylinux_2_28_x86_64.whl → 2025.6.18__cp313-cp313-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/config/feature_list.py +6 -0
- fbgemm_gpu/docs/version.py +1 -1
- fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so +0 -0
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +39 -1
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +65 -2
- fbgemm_gpu/tbe/bench/tbe_data_config.py +2 -182
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +223 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +6 -2
- fbgemm_gpu/tbe/ssd/training.py +48 -16
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +154 -40
- fbgemm_gpu/tbe/utils/requests.py +2 -7
- fbgemm_gpu/utils/filestore.py +56 -3
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/METADATA +1 -4
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/RECORD +17 -16
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/top_level.txt +0 -0
fbgemm_gpu/config/feature_list.py
CHANGED

@@ -60,6 +60,12 @@ class FeatureGateName(Enum):
     # Enable bounds_check_indices_v2
     BOUNDS_CHECK_INDICES_V2 = auto()
 
+    # Disable FP8 quantization vectorization
+    DISABLE_FP8_QUANT_VECTORIZATION = auto()
+
+    # Enable TBE input parameters extraction
+    TBE_REPORT_INPUT_PARAMS = auto()
+
     def is_enabled(self) -> bool:
         return FeatureGate.is_enabled(self)
 
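Note: the sketch below is not part of the diff; it shows how the new gates can be queried through the same is_enabled() method visible in the hunk above. The import path is an assumption based on this package's fbgemm_gpu.config module.

# Hedged usage sketch for the new feature gates.
from fbgemm_gpu.config import FeatureGateName

if FeatureGateName.TBE_REPORT_INPUT_PARAMS.is_enabled():
    print("TBE input-parameter reporting is gated on")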
fbgemm_gpu/docs/version.py
CHANGED

fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so
CHANGED
Binary file

fbgemm_gpu/fbgemm.so
CHANGED
Binary file
fbgemm_gpu/split_table_batched_embeddings_ops_common.py
CHANGED

@@ -11,7 +11,7 @@
 
 import enum
 from dataclasses import dataclass
-from typing import List, NamedTuple, Tuple
+from typing import List, NamedTuple, Optional, Tuple
 
 import torch
 from torch import Tensor

@@ -60,6 +60,43 @@ class EmbeddingLocation(enum.IntEnum):
         raise ValueError(f"Cannot parse value into EmbeddingLocation: {key}")
 
 
+class EvictionPolicy(NamedTuple):
+    eviction_trigger_mode: int = (
+        0  # disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
+    )
+    eviction_strategy: int = (
+        0  # 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+    )
+    eviction_step_intervals: Optional[int] = (
+        None  # trigger_step_interval if trigger mode is iteration
+    )
+    eviction_mem_threshold_gb: Optional[int] = (
+        None  # eviction trigger condition if trigger mode is mem_util
+    )
+    counter_thresholds: Optional[List[int]] = (
+        None  # count_thresholds for each table if eviction strategy is feature score
+    )
+    ttls_in_mins: Optional[List[int]] = (
+        None  # ttls_in_mins for each table if eviction strategy is timestamp
+    )
+    counter_decay_rates: Optional[List[float]] = (
+        None  # count_decay_rates for each table if eviction strategy is feature score
+    )
+    l2_weight_thresholds: Optional[List[float]] = (
+        None  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+    )
+    interval_for_insufficient_eviction_s: int = (
+        # wait at least # seconds before trigger next round of eviction, if last finished eviction is insufficient
+        # insufficient means we didn't evict enough rows, so we want to wait longer time to
+        # avoid another insufficient eviction
+        600
+    )
+    interval_for_sufficient_eviction_s: int = (
+        # wait at least # seconds before trigger next round of eviction, if last finished eviction is sufficient
+        60
+    )
+
+
 class KVZCHParams(NamedTuple):
     # global bucket id start and global bucket id end offsets for each logical table,
     # where start offset is inclusive and end offset is exclusive

@@ -69,6 +106,7 @@ class KVZCHParams(NamedTuple):
     bucket_sizes: List[int] = []
     # enable optimizer offloading or not
    enable_optimizer_offloading: bool = False
+    eviction_policy: Optional[EvictionPolicy] = None
 
     def validate(self) -> None:
         assert len(self.bucket_offsets) == len(self.bucket_sizes), (
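Note: the sketch below is not part of the diff; it illustrates how the new EvictionPolicy and eviction_policy field above might be constructed, assuming only the field semantics spelled out in the inline comments. The values are illustrative, and any additional constraints enforced by validate() are not shown here.

# Hedged sketch: iteration-triggered (mode 1), timestamp-based (strategy 0)
# eviction, checked every 1000 steps, with a 60-minute TTL for one table.
from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
    EvictionPolicy,
    KVZCHParams,
)

policy = EvictionPolicy(
    eviction_trigger_mode=1,     # 1: iteration-based trigger
    eviction_strategy=0,         # 0: timestamp
    eviction_step_intervals=1000,
    ttls_in_mins=[60],           # one TTL per table
)

# Attach the policy to the KV-ZCH parameters; other fields keep their defaults.
params = KVZCHParams(eviction_policy=policy)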
fbgemm_gpu/split_table_batched_embeddings_ops_training.py
CHANGED

@@ -51,6 +51,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops_training_common import (
     generate_vbe_metadata,
     is_torchdynamo_compiling,
 )
+from fbgemm_gpu.tbe.stats import TBEBenchmarkParamsReporter
 from fbgemm_gpu.tbe_input_multiplexer import (
     TBEInfo,
     TBEInputInfo,

@@ -1441,6 +1442,11 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             self._debug_print_input_stats_factory()
         )
 
+        # Get a reporter function pointer
+        self._report_input_params: Callable[..., None] = (
+            self.__report_input_params_factory()
+        )
+
         if optimizer == OptimType.EXACT_SGD and self.use_writeback_bwd_prehook:
             # Register writeback hook for Exact_SGD optimizer
             self.log(

@@ -1947,11 +1953,24 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             per_sample_weights,
             batch_size_per_feature_per_rank,
             force_cast_input_types=True,
+            prefetch_pipeline=False,
         )
 
         # Print input stats if enable (for debugging purpose only)
         self._debug_print_input_stats(indices, offsets, per_sample_weights)
 
+        # Extract and Write input stats if enable
+        self._report_input_params(
+            feature_rows=self.rows_per_table,
+            feature_dims=self.feature_dims,
+            iteration=self.iter.item() if hasattr(self, "iter") else 0,
+            indices=indices,
+            offsets=offsets,
+            op_id=self.uuid,
+            per_sample_weights=per_sample_weights,
+            batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+        )
+
         if not is_torchdynamo_compiling():
             # Mutations of nn.Module attr forces dynamo restart of Analysis which increases compilation time
 
@@ -2478,6 +2497,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
             per_sample_weights=None,
             batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
             force_cast_input_types=False,
+            prefetch_pipeline=self.prefetch_pipeline,
         )
 
         with self._recording_to_timer(

@@ -3543,6 +3563,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
         per_sample_weights: Optional[Tensor] = None,
         batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
         force_cast_input_types: bool = True,
+        prefetch_pipeline: bool = False,
    ) -> Tuple[Tensor, Tensor, Optional[Tensor], invokers.lookup_args.VBEMetadata]:
        """
        Prepare TBE inputs as follows:

@@ -3613,9 +3634,17 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
            per_sample_weights = per_sample_weights.float()
 
        if self.bounds_check_mode_int != BoundsCheckMode.NONE.value:
+            # Override the bounds check version based on prefetch_pipeline
+            use_bounds_check_v2 = self.bounds_check_version == 2 or prefetch_pipeline
+            bounds_check_version = (
+                2 if use_bounds_check_v2 else self.bounds_check_version
+            )
+
+            vbe = vbe_metadata.B_offsets is not None
+
            # Compute B info and VBE metadata for bounds_check_indices only if
            # VBE and bounds check indices v2 are used
-            if vbe and
+            if vbe and use_bounds_check_v2:
                B_offsets = vbe_metadata.B_offsets
                B_offsets_rank_per_feature = vbe_metadata.B_offsets_rank_per_feature
                output_offsets_feature_rank = vbe_metadata.output_offsets_feature_rank

@@ -3653,7 +3682,8 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
                b_t_map=b_t_map,
                info_B_num_bits=self.info_B_num_bits,
                info_B_mask=self.info_B_mask,
-                bounds_check_version=
+                bounds_check_version=bounds_check_version,
+                prefetch_pipeline=prefetch_pipeline,
            )
 
        return indices, offsets, per_sample_weights, vbe_metadata

@@ -3792,6 +3822,39 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
            return _debug_print_input_stats_factory_impl
        return _debug_print_input_stats_factory_null
 
+    @torch.jit.ignore
+    def __report_input_params_factory(
+        self,
+    ) -> Callable[..., None]:
+        """
+        This function returns a function pointer based on the environment variable `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL`.
+
+        If `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` is set to a value greater than 0, it returns a function pointer that:
+        - Reports input parameters (TBEDataConfig).
+        - Writes the output as a JSON file.
+
+        If `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` is not set or is set to 0, it returns a dummy function pointer that performs no action.
+        """
+
+        @torch.jit.ignore
+        def __report_input_params_factory_null(
+            feature_rows: Tensor,
+            feature_dims: Tensor,
+            iteration: int,
+            indices: Tensor,
+            offsets: Tensor,
+            op_id: Optional[str] = None,
+            per_sample_weights: Optional[Tensor] = None,
+            batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
+        ) -> None:
+            pass
+
+        if self._feature_is_enabled(FeatureGateName.TBE_REPORT_INPUT_PARAMS):
+
+            reporter = TBEBenchmarkParamsReporter.create()
+            return reporter.report_stats
+        return __report_input_params_factory_null
+
 
 class DenseTableBatchedEmbeddingBagsCodegen(nn.Module):
     """
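Note: the sketch below is not part of the diff; it only restates, as runnable configuration, how the new reporting path above is switched on. The environment variable names come from this diff; whether the TBE_REPORT_INPUT_PARAMS feature gate is enabled by default is not shown here.

# Hedged sketch: enable TBE input-parameter reporting before constructing the TBE module.
import os

os.environ["FBGEMM_REPORT_INPUT_PARAMS_INTERVAL"] = "100"    # report every 100 iterations
os.environ["FBGEMM_REPORT_INPUT_PARAMS_ITER_START"] = "0"
os.environ["FBGEMM_REPORT_INPUT_PARAMS_ITER_END"] = "-1"     # -1 = no upper bound
os.environ["FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX"] = "tbe_reports"

# With the gate on, __init__ wires self._report_input_params to
# TBEBenchmarkParamsReporter.report_stats, and forward() calls it with
# feature_rows, feature_dims, iteration, indices, offsets, op_id,
# per_sample_weights, and batch_size_per_feature_per_rank, as shown above.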
fbgemm_gpu/tbe/bench/tbe_data_config.py
CHANGED

@@ -9,19 +9,11 @@
 
 import dataclasses
 import json
-from typing import Any, Dict,
+from typing import Any, Dict, Optional
 
-import numpy as np
 import torch
 
-from fbgemm_gpu.tbe.utils.common import get_device
-from fbgemm_gpu.tbe.utils.requests import (
-    generate_batch_sizes_from_stats,
-    generate_pooling_factors_from_stats,
-    get_table_batched_offsets_from_dense,
-    maybe_to_dtype,
-    TBERequest,
-)
+from fbgemm_gpu.tbe.utils.common import get_device
 
 from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
 

@@ -104,175 +96,3 @@ class TBEDataConfig:
     def _new_weights(self, size: int) -> Optional[torch.Tensor]:
         # Per-sample weights will always be FP32
         return None if not self.weighted else torch.randn(size, device=get_device())
-
-    def _generate_batch_sizes(self) -> Tuple[List[int], Optional[List[List[int]]]]:
-        if self.variable_B():
-            assert (
-                self.batch_params.vbe_num_ranks is not None
-            ), "vbe_num_ranks must be set for varaible batch size generation"
-            return generate_batch_sizes_from_stats(
-                self.batch_params.B,
-                self.T,
-                # pyre-ignore [6]
-                self.batch_params.sigma_B,
-                self.batch_params.vbe_num_ranks,
-                # pyre-ignore [6]
-                self.batch_params.vbe_distribution,
-            )
-
-        else:
-            return ([self.batch_params.B] * self.T, None)
-
-    def _generate_pooling_info(self, iters: int, Bs: List[int]) -> torch.Tensor:
-        if self.variable_L():
-            # Generate L from stats
-            _, L_offsets = generate_pooling_factors_from_stats(
-                iters,
-                Bs,
-                self.pooling_params.L,
-                # pyre-ignore [6]
-                self.pooling_params.sigma_L,
-                # pyre-ignore [6]
-                self.pooling_params.length_distribution,
-            )
-
-        else:
-            Ls = [self.pooling_params.L] * (sum(Bs) * iters)
-            L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
-
-        return L_offsets
-
-    def _generate_indices(
-        self,
-        iters: int,
-        Bs: List[int],
-        L_offsets: torch.Tensor,
-    ) -> torch.Tensor:
-        total_B = sum(Bs)
-        L_offsets_list = L_offsets.tolist()
-        indices_list = []
-        for it in range(iters):
-            # L_offsets is defined over the entire set of batches for a single iteration
-            start_offset = L_offsets_list[it * total_B]
-            end_offset = L_offsets_list[(it + 1) * total_B]
-
-            indices_list.append(
-                torch.ops.fbgemm.tbe_generate_indices_from_distribution(
-                    self.indices_params.heavy_hitters,
-                    self.indices_params.zipf_q,
-                    self.indices_params.zipf_s,
-                    # max_index = dimensions of the embedding table
-                    self.E,
-                    # num_indices = number of indices to generate
-                    end_offset - start_offset,
-                )
-            )
-
-        return torch.cat(indices_list)
-
-    def _build_requests_jagged(
-        self,
-        iters: int,
-        Bs: List[int],
-        Bs_feature_rank: Optional[List[List[int]]],
-        L_offsets: torch.Tensor,
-        all_indices: torch.Tensor,
-    ) -> List[TBERequest]:
-        total_B = sum(Bs)
-        all_indices = all_indices.flatten()
-        requests = []
-        for it in range(iters):
-            start_offset = L_offsets[it * total_B]
-            it_L_offsets = torch.concat(
-                [
-                    torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
-                    L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
-                ]
-            )
-            requests.append(
-                TBERequest(
-                    maybe_to_dtype(
-                        all_indices[start_offset : L_offsets[(it + 1) * total_B]],
-                        self.indices_params.index_dtype,
-                    ),
-                    maybe_to_dtype(
-                        it_L_offsets.to(get_device()), self.indices_params.offset_dtype
-                    ),
-                    self._new_weights(int(it_L_offsets[-1].item())),
-                    Bs_feature_rank if self.variable_B() else None,
-                )
-            )
-        return requests
-
-    def _build_requests_dense(
-        self, iters: int, all_indices: torch.Tensor
-    ) -> List[TBERequest]:
-        # NOTE: We're using existing code from requests.py to build the
-        # requests, and since the existing code requires 2D view of all_indices,
-        # the existing all_indices must be reshaped
-        all_indices = all_indices.reshape(iters, -1)
-
-        requests = []
-        for it in range(iters):
-            indices, offsets = get_table_batched_offsets_from_dense(
-                all_indices[it].view(
-                    self.T, self.batch_params.B, self.pooling_params.L
-                ),
-                use_cpu=self.use_cpu,
-            )
-            requests.append(
-                TBERequest(
-                    maybe_to_dtype(indices, self.indices_params.index_dtype),
-                    maybe_to_dtype(offsets, self.indices_params.offset_dtype),
-                    self._new_weights(
-                        self.T * self.batch_params.B * self.pooling_params.L
-                    ),
-                )
-            )
-        return requests
-
-    def generate_requests(
-        self,
-        iters: int = 1,
-    ) -> List[TBERequest]:
-        # Generate batch sizes
-        Bs, Bs_feature_rank = self._generate_batch_sizes()
-
-        # Generate pooling info
-        L_offsets = self._generate_pooling_info(iters, Bs)
-
-        # Generate indices
-        all_indices = self._generate_indices(iters, Bs, L_offsets)
-
-        # Build TBE requests
-        if self.variable_B() or self.variable_L():
-            return self._build_requests_jagged(
-                iters, Bs, Bs_feature_rank, L_offsets, all_indices
-            )
-        else:
-            return self._build_requests_dense(iters, all_indices)
-
-    def generate_embedding_dims(self) -> Tuple[int, List[int]]:
-        if self.mixed_dim:
-            Ds = [
-                round_up(
-                    np.random.randint(low=int(0.5 * self.D), high=int(1.5 * self.D)), 4
-                )
-                for _ in range(self.T)
-            ]
-            return (int(np.average(Ds)), Ds)
-        else:
-            return (self.D, [self.D] * self.T)
-
-    def generate_feature_requires_grad(self, size: int) -> torch.Tensor:
-        assert size <= self.T, "size of feature_requires_grad must be less than T"
-        weighted_requires_grad_tables = np.random.choice(
-            self.T, replace=False, size=(size,)
-        ).tolist()
-        return (
-            torch.tensor(
-                [1 if t in weighted_requires_grad_tables else 0 for t in range(self.T)]
-            )
-            .to(get_device())
-            .int()
-        )
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py
ADDED

@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import List, Optional, Tuple
+
+import torch
+
+from fbgemm_gpu.tbe.bench import TBEDataConfig
+from fbgemm_gpu.tbe.utils.common import get_device, round_up
+
+from fbgemm_gpu.tbe.utils.requests import (
+    generate_batch_sizes_from_stats,
+    generate_pooling_factors_from_stats,
+    get_table_batched_offsets_from_dense,
+    maybe_to_dtype,
+    TBERequest,
+)
+
+
+def _generate_batch_sizes(
+    tbe_data_config: TBEDataConfig,
+) -> Tuple[List[int], Optional[List[List[int]]]]:
+    if tbe_data_config.variable_B():
+        assert (
+            tbe_data_config.batch_params.vbe_num_ranks is not None
+        ), "vbe_num_ranks must be set for varaible batch size generation"
+        return generate_batch_sizes_from_stats(
+            tbe_data_config.batch_params.B,
+            tbe_data_config.T,
+            # pyre-ignore [6]
+            tbe_data_config.batch_params.sigma_B,
+            tbe_data_config.batch_params.vbe_num_ranks,
+            # pyre-ignore [6]
+            tbe_data_config.batch_params.vbe_distribution,
+        )
+
+    else:
+        return ([tbe_data_config.batch_params.B] * tbe_data_config.T, None)
+
+
+def _generate_pooling_info(
+    tbe_data_config: TBEDataConfig, iters: int, Bs: List[int]
+) -> torch.Tensor:
+    if tbe_data_config.variable_L():
+        # Generate L from stats
+        _, L_offsets = generate_pooling_factors_from_stats(
+            iters,
+            Bs,
+            tbe_data_config.pooling_params.L,
+            # pyre-ignore [6]
+            tbe_data_config.pooling_params.sigma_L,
+            # pyre-ignore [6]
+            tbe_data_config.pooling_params.length_distribution,
+        )
+    else:
+        Ls = [tbe_data_config.pooling_params.L] * (sum(Bs) * iters)
+        L_offsets = torch.tensor([0] + Ls, dtype=torch.long).cumsum(0)
+
+    return L_offsets
+
+
+def _generate_indices(
+    tbe_data_config: TBEDataConfig,
+    iters: int,
+    Bs: List[int],
+    L_offsets: torch.Tensor,
+) -> torch.Tensor:
+    total_B = sum(Bs)
+    L_offsets_list = L_offsets.tolist()
+    indices_list = []
+    for it in range(iters):
+        # L_offsets is defined over the entire set of batches for a single iteration
+        start_offset = L_offsets_list[it * total_B]
+        end_offset = L_offsets_list[(it + 1) * total_B]
+
+        indices_list.append(
+            torch.ops.fbgemm.tbe_generate_indices_from_distribution(
+                tbe_data_config.indices_params.heavy_hitters,
+                tbe_data_config.indices_params.zipf_q,
+                tbe_data_config.indices_params.zipf_s,
+                # max_index = dimensions of the embedding table
+                tbe_data_config.E,
+                # num_indices = number of indices to generate
+                end_offset - start_offset,
+            )
+        )
+
+    return torch.cat(indices_list)
+
+
+def _build_requests_jagged(
+    tbe_data_config: TBEDataConfig,
+    iters: int,
+    Bs: List[int],
+    Bs_feature_rank: Optional[List[List[int]]],
+    L_offsets: torch.Tensor,
+    all_indices: torch.Tensor,
+) -> List[TBERequest]:
+    total_B = sum(Bs)
+    all_indices = all_indices.flatten()
+    requests = []
+    for it in range(iters):
+        start_offset = L_offsets[it * total_B]
+        it_L_offsets = torch.concat(
+            [
+                torch.zeros(1, dtype=L_offsets.dtype, device=L_offsets.device),
+                L_offsets[it * total_B + 1 : (it + 1) * total_B + 1] - start_offset,
+            ]
+        )
+        requests.append(
+            TBERequest(
+                maybe_to_dtype(
+                    all_indices[start_offset : L_offsets[(it + 1) * total_B]],
+                    tbe_data_config.indices_params.index_dtype,
+                ),
+                maybe_to_dtype(
+                    it_L_offsets.to(get_device()),
+                    tbe_data_config.indices_params.offset_dtype,
+                ),
+                tbe_data_config._new_weights(int(it_L_offsets[-1].item())),
+                Bs_feature_rank if tbe_data_config.variable_B() else None,
+            )
+        )
+    return requests
+
+
+def _build_requests_dense(
+    tbe_data_config: TBEDataConfig, iters: int, all_indices: torch.Tensor
+) -> List[TBERequest]:
+    # NOTE: We're using existing code from requests.py to build the
+    # requests, and since the existing code requires 2D view of all_indices,
+    # the existing all_indices must be reshaped
+    all_indices = all_indices.reshape(iters, -1)
+
+    requests = []
+    for it in range(iters):
+        indices, offsets = get_table_batched_offsets_from_dense(
+            all_indices[it].view(
+                tbe_data_config.T,
+                tbe_data_config.batch_params.B,
+                tbe_data_config.pooling_params.L,
+            ),
+            use_cpu=tbe_data_config.use_cpu,
+        )
+        requests.append(
+            TBERequest(
+                maybe_to_dtype(indices, tbe_data_config.indices_params.index_dtype),
+                maybe_to_dtype(offsets, tbe_data_config.indices_params.offset_dtype),
+                tbe_data_config._new_weights(
+                    tbe_data_config.T
+                    * tbe_data_config.batch_params.B
+                    * tbe_data_config.pooling_params.L
+                ),
+            )
+        )
+    return requests
+
+
+def generate_requests(
+    tbe_data_config: TBEDataConfig,
+    iters: int = 1,
+) -> List[TBERequest]:
+    # Generate batch sizes
+    Bs, Bs_feature_rank = _generate_batch_sizes(tbe_data_config)
+
+    # Generate pooling info
+    L_offsets = _generate_pooling_info(tbe_data_config, iters, Bs)
+
+    # Generate indices
+    all_indices = _generate_indices(tbe_data_config, iters, Bs, L_offsets)
+
+    # Build TBE requests
+    if tbe_data_config.variable_B() or tbe_data_config.variable_L():
+        return _build_requests_jagged(
+            tbe_data_config, iters, Bs, Bs_feature_rank, L_offsets, all_indices
+        )
+    else:
+        return _build_requests_dense(tbe_data_config, iters, all_indices)
+
+
+def generate_embedding_dims(tbe_data_config: TBEDataConfig) -> Tuple[int, List[int]]:
+    if tbe_data_config.mixed_dim:
+        Ds = [
+            round_up(
+                int(
+                    torch.randint(
+                        low=int(0.5 * tbe_data_config.D),
+                        high=int(1.5 * tbe_data_config.D),
+                        size=(1,),
+                    ).item()
+                ),
+                4,
+            )
+            for _ in range(tbe_data_config.T)
+        ]
+        return (sum(Ds) // len(Ds), Ds)
+    else:
+        return (tbe_data_config.D, [tbe_data_config.D] * tbe_data_config.T)
+
+
+def generate_feature_requires_grad(
+    tbe_data_config: TBEDataConfig, size: int
+) -> torch.Tensor:
+    assert (
+        size <= tbe_data_config.T
+    ), "size of feature_requires_grad must be less than T"
+    weighted_requires_grad_tables = torch.randperm(tbe_data_config.T)[:size].tolist()
+    return (
+        torch.tensor(
+            [
+                1 if t in weighted_requires_grad_tables else 0
+                for t in range(tbe_data_config.T)
+            ]
+        )
+        .to(get_device())
+        .int()
+    )
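Note: the sketch below is not part of the diff; it illustrates the call-site migration implied by this refactor. Request generation used to be a method on TBEDataConfig (config.generate_requests(iters)); in the new helper module it is a free function taking the config as its first argument. TBEDataConfig construction is not shown in this diff, so `config` stands in for an already-built instance.

# Hedged sketch of the new benchmark-helper call pattern.
from fbgemm_gpu.tbe.bench.tbe_data_config_bench_helper import (
    generate_embedding_dims,
    generate_requests,
)

def build_benchmark_requests(config, iters: int = 10):
    D, Ds = generate_embedding_dims(config)   # average dim and per-table dims
    return generate_requests(config, iters)   # List[TBERequest]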
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py
CHANGED

@@ -11,8 +11,12 @@ import click
 import torch
 import yaml
 
-from .tbe_data_config import
-
+from fbgemm_gpu.tbe.bench.tbe_data_config import (
+    BatchParams,
+    IndicesParams,
+    PoolingParams,
+    TBEDataConfig,
+)
 
 
 class TBEDataConfigLoader:
fbgemm_gpu/tbe/ssd/training.py
CHANGED

@@ -248,6 +248,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
            self.total_hash_size_bits: int = 0
        else:
            self.total_hash_size_bits: int = int(log2(float(hash_size_cumsum[-1])) + 1)
+        self.register_buffer(
+            "table_hash_size_cumsum",
+            torch.tensor(
+                hash_size_cumsum, device=self.current_device, dtype=torch.int64
+            ),
+        )
        # The last element is to easily access # of rows of each table by
        self.total_hash_size_bits = int(log2(float(hash_size_cumsum[-1])) + 1)
        self.total_hash_size: int = hash_size_cumsum[-1]

@@ -288,6 +294,10 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
            "feature_dims",
            torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
        )
+        self.register_buffer(
+            "table_dims",
+            torch.tensor(dims, device="cpu", dtype=torch.int64),
+        )
 
        (info_B_num_bits_, info_B_mask_) = torch.ops.fbgemm.get_infos_metadata(
            self.D_offsets,  # unused tensor

@@ -518,6 +528,7 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
            logging.warning("dist is not initialized, treating as single gpu cases")
            tbe_unique_id = SSDTableBatchedEmbeddingBags._local_instance_index
        self.tbe_unique_id = tbe_unique_id
+        self.l2_cache_size = l2_cache_size
        logging.info(f"tbe_unique_id: {tbe_unique_id}")
        if self.backend_type == BackendType.SSD:
            logging.info(

@@ -564,12 +575,12 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
                self.res_params.table_offsets,
                self.res_params.table_sizes,
                (
-                    tensor_pad4(self.
+                    tensor_pad4(self.table_dims)
                    if self.enable_optimizer_offloading
                    else None
                ),
                (
-                    self.
+                    self.table_hash_size_cumsum.cpu()
                    if self.enable_optimizer_offloading
                    else None
                ),

@@ -609,28 +620,42 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
                f"feature_dims={self.feature_dims},"
                f"hash_size_cumsum={self.hash_size_cumsum}"
            )
+            table_dims = (
+                tensor_pad4(self.table_dims)
+                if self.enable_optimizer_offloading
+                else None
+            )  # table_dims
+            eviction_config = None
+            if self.kv_zch_params and self.kv_zch_params.eviction_policy:
+                eviction_mem_threshold_gb = (
+                    self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+                    if self.kv_zch_params.eviction_policy.eviction_mem_threshold_gb
+                    else self.l2_cache_size
+                )
+                eviction_config = torch.classes.fbgemm.FeatureEvictConfig(
+                    self.kv_zch_params.eviction_policy.eviction_trigger_mode,  # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
+                    self.kv_zch_params.eviction_policy.eviction_strategy,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+                    self.kv_zch_params.eviction_policy.eviction_step_intervals,  # trigger_step_interval if trigger mode is iteration
+                    eviction_mem_threshold_gb,  # mem_util_threshold_in_GB if trigger mode is mem_util
+                    self.kv_zch_params.eviction_policy.ttls_in_mins,  # ttls_in_mins for each table if eviction strategy is timestamp
+                    self.kv_zch_params.eviction_policy.counter_thresholds,  # counter_thresholds for each table if eviction strategy is feature score
+                    self.kv_zch_params.eviction_policy.counter_decay_rates,  # counter_decay_rates for each table if eviction strategy is feature score
+                    self.kv_zch_params.eviction_policy.l2_weight_thresholds,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                    table_dims.tolist() if table_dims is not None else None,
+                    self.kv_zch_params.eviction_policy.interval_for_insufficient_eviction_s,
+                    self.kv_zch_params.eviction_policy.interval_for_sufficient_eviction_s,
+                )
            self._ssd_db = torch.classes.fbgemm.DramKVEmbeddingCacheWrapper(
                self.cache_row_dim,
                ssd_uniform_init_lower,
                ssd_uniform_init_upper,
-
-                0,  # trigger_step_interval if trigger mode is iteration
-                0,  # mem_util_threshold_in_GB if trigger mode is mem_util
-                0,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
-                None,  # count_thresholds for each table if eviction strategy is feature score
-                None,  # ttls_in_mins for each table if eviction strategy is timestamp
-                None,  # count_decay_rates for each table if eviction strategy is feature score
-                None,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                eviction_config,
                ssd_rocksdb_shards,  # num_shards
                ssd_rocksdb_shards,  # num_threads
                weights_precision.bit_rate(),  # row_storage_bitwidth
+                table_dims,
                (
-
-                    if self.enable_optimizer_offloading
-                    else None
-                ),  # table_dims
-                (
-                    self.hash_size_cumsum.cpu()
+                    self.table_hash_size_cumsum.cpu()
                    if self.enable_optimizer_offloading
                    else None
                ),  # hash_size_cumsum

@@ -2434,6 +2459,13 @@ class SSDTableBatchedEmbeddingBags(nn.Module):
                f"created snapshot for weight states: {snapshot_handle}, latency: {(time.time() - start_time) * 1000} ms"
            )
        elif self.backend_type == BackendType.DRAM:
+            # if there is any ongoing eviction, lets wait until eviction is finished before state_dict
+            # so that we can reach consistent model state before/after state_dict
+            evict_wait_start_time = time.time()
+            self.ssd_db.wait_until_eviction_done()
+            logging.info(
+                f"state_dict wait for ongoing eviction: {time.time() - evict_wait_start_time} s"
+            )
            self.flush(force=should_flush)
        return snapshot_handle, checkpoint_handle
 
fbgemm_gpu/tbe/stats/bench_params_reporter.py
CHANGED

@@ -8,26 +8,28 @@
 # pyre-strict
 
 import io
+import json
 import logging
 import os
 from typing import List, Optional
 
 import fbgemm_gpu  # noqa F401
-import numpy as np  # usort:skip
 import torch  # usort:skip
 
-from fbgemm_gpu.
-    SplitTableBatchedEmbeddingBagsCodegen,
-)
-from fbgemm_gpu.tbe.bench import (
+from fbgemm_gpu.tbe.bench.tbe_data_config import (
     BatchParams,
     IndicesParams,
     PoolingParams,
     TBEDataConfig,
 )
 
-
-
+open_source: bool = False
+try:
+    # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+    if getattr(fbgemm_gpu, "open_source", False):
+        open_source = True
+except Exception:
+    pass
 
 if open_source:
     from fbgemm_gpu.utils import FileStore

@@ -43,7 +45,8 @@ class TBEBenchmarkParamsReporter:
    def __init__(
        self,
        report_interval: int,
-
+        report_iter_start: int = 0,
+        report_iter_end: int = -1,
        bucket: Optional[str] = None,
        path_prefix: Optional[str] = None,
    ) -> None:

@@ -52,13 +55,30 @@ class TBEBenchmarkParamsReporter:
 
        Args:
            report_interval (int): The interval at which reports are generated.
-
+            report_iter_start (int): The start of the iteration range to capture. Defaults to 0.
+            report_iter_end (int): The end of the iteration range to capture. Defaults to -1 (last iteration).
            bucket (Optional[str], optional): The storage bucket for reports. Defaults to None.
            path_prefix (Optional[str], optional): The path prefix for report storage. Defaults to None.
        """
+        assert report_interval > 0, "report_interval must be greater than 0"
+        assert (
+            report_iter_start >= 0
+        ), "report_iter_start must be greater than or equal to 0"
+        assert (
+            report_iter_end >= -1
+        ), "report_iter_end must be greater than or equal to -1"
+        assert (
+            report_iter_end == -1 or report_iter_start <= report_iter_end
+        ), "report_iter_start must be less than or equal to report_iter_end"
+
        self.report_interval = report_interval
-        self.
-        self.
+        self.report_iter_start = report_iter_start
+        self.report_iter_end = report_iter_end
+
+        if path_prefix is not None and path_prefix.endswith("/"):
+            path_prefix = path_prefix[:-1]
+
+        self.path_prefix = path_prefix
 
        default_bucket = "/tmp" if open_source else "tlparse_reports"
        bucket = (

@@ -68,22 +88,65 @@ class TBEBenchmarkParamsReporter:
        )
        self.filestore = FileStore(bucket)
 
+        if self.path_prefix is not None and not self.filestore.exists(self.path_prefix):
+            self.filestore.create_directory(self.path_prefix)
+
        self.logger: logging.Logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
 
+    @classmethod
+    def create(cls) -> "TBEBenchmarkParamsReporter":
+        """
+        This method returns an instance of TBEBenchmarkParamsReporter based on environment variables.
+
+        If the `FBGEMM_REPORT_INPUT_PARAMS_INTERVAL` environment variable is set to a value greater than 0, it creates an instance that:
+        - Reports input parameters (TBEDataConfig).
+        - Writes the output as a JSON file.
+
+        Additionally, the following environment variables are considered:
+        - `FBGEMM_REPORT_INPUT_PARAMS_ITER_START`: Specifies the start of the iteration range to capture.
+        - `FBGEMM_REPORT_INPUT_PARAMS_ITER_END`: Specifies the end of the iteration range to capture.
+        - `FBGEMM_REPORT_INPUT_PARAMS_BUCKET`: Specifies the bucket for reporting.
+        - `FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX`: Specifies the path prefix for reporting.
+
+        Returns:
+            TBEBenchmarkParamsReporter: An instance configured based on the environment variables.
+        """
+        report_interval = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_INTERVAL", "1")
+        )
+        report_iter_start = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_START", "0")
+        )
+        report_iter_end = int(
+            os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_ITER_END", "-1")
+        )
+        bucket = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_BUCKET", "")
+        path_prefix = os.environ.get("FBGEMM_REPORT_INPUT_PARAMS_PATH_PREFIX", "")
+
+        return cls(
+            report_interval=report_interval,
+            report_iter_start=report_iter_start,
+            report_iter_end=report_iter_end,
+            bucket=bucket,
+            path_prefix=path_prefix,
+        )
+
    def extract_params(
        self,
-
+        feature_rows: torch.Tensor,
+        feature_dims: torch.Tensor,
        indices: torch.Tensor,
        offsets: torch.Tensor,
        per_sample_weights: Optional[torch.Tensor] = None,
        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
    ) -> TBEDataConfig:
        """
-        Extracts parameters from the embedding operation, input indices and offsets to create a TBEDataConfig.
+        Extracts parameters from the embedding operation, input indices, and offsets to create a TBEDataConfig.
 
        Args:
-
+            feature_rows (torch.Tensor): Number of rows in each feature.
+            feature_dims (torch.Tensor): Number of dimensions in each feature.
            indices (torch.Tensor): The input indices tensor.
            offsets (torch.Tensor): The input offsets tensor.
            per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.

@@ -92,24 +155,33 @@ class TBEBenchmarkParamsReporter:
        Returns:
            TBEDataConfig: The configuration data for TBE benchmarking.
        """
+
+        Es = feature_rows.tolist()
+        Ds = feature_dims.tolist()
+
+        assert len(Es) == len(
+            Ds
+        ), "feature_rows and feature_dims must have the same length"
+
        # Transfer indices back to CPU for EEG analysis
        indices_cpu = indices.cpu()
 
-        # Extract embedding table specs
-        embedding_specs = [
-            embedding_op.embedding_specs[t] for t in embedding_op.feature_table_map
-        ]
-        rowcounts = [embedding_spec[0] for embedding_spec in embedding_specs]
-        dims = [embedding_spec[1] for embedding_spec in embedding_specs]
-
        # Set T to be the number of features we are looking at
-        T = len(
+        T = len(Ds)
        # Set E to be the mean of the rowcounts to avoid biasing
-        E =
+        E = (
+            Es[0]
+            if len(set(Es)) == 1
+            else torch.ceil(torch.mean(torch.tensor(feature_rows)))
+        )
        # Set mixed_dim to be True if there are multiple dims
-        mixed_dim = len(set(
+        mixed_dim = len(set(Ds)) > 1
        # Set D to be the mean of the dims to avoid biasing
-        D =
+        D = (
+            Ds[0]
+            if not mixed_dim
+            else torch.ceil(torch.mean(torch.tensor(feature_dims)))
+        )
 
        # Compute indices distribution parameters
        heavy_hitters, q, s, _, _ = torch.ops.fbgemm.tbe_estimate_indices_distribution(

@@ -123,8 +195,18 @@ class TBEBenchmarkParamsReporter:
        batch_params = BatchParams(
            B=((offsets.numel() - 1) // T),
            sigma_B=(
-
-
+                int(
+                    torch.ceil(
+                        torch.std(
+                            torch.tensor(
+                                [
+                                    b
+                                    for bs in batch_size_per_feature_per_rank
+                                    for b in bs
+                                ]
+                            )
+                        )
+                    )
                )
                if batch_size_per_feature_per_rank
                else None

@@ -138,11 +220,19 @@ class TBEBenchmarkParamsReporter:
        )
 
        # Compute pooling parameters
-        bag_sizes =
+        bag_sizes = offsets[1:] - offsets[:-1]
        mixed_bag_sizes = len(set(bag_sizes)) > 1
        pooling_params = PoolingParams(
-            L=
-
+            L=(
+                int(torch.ceil(torch.mean(bag_sizes.float())))
+                if mixed_bag_sizes
+                else int(bag_sizes[0])
+            ),
+            sigma_L=(
+                int(torch.ceil(torch.std(bag_sizes.float())))
+                if mixed_bag_sizes
+                else None
+            ),
            length_distribution=("normal" if mixed_bag_sizes else None),
        )
 

@@ -160,34 +250,58 @@ class TBEBenchmarkParamsReporter:
 
    def report_stats(
        self,
-
+        feature_rows: torch.Tensor,
+        feature_dims: torch.Tensor,
+        iteration: int,
        indices: torch.Tensor,
        offsets: torch.Tensor,
+        op_id: str = "",
        per_sample_weights: Optional[torch.Tensor] = None,
        batch_size_per_feature_per_rank: Optional[List[List[int]]] = None,
    ) -> None:
        """
-        Reports the configuration of the embedding operation and input data then writes the TBE configuration to the filestore.
+        Reports the configuration of the embedding operation and input data, then writes the TBE configuration to the filestore.
 
        Args:
-
+            feature_rows (torch.Tensor): Number of rows in each feature.
+            feature_dims (torch.Tensor): Number of dimensions in each feature.
+            iteration (int): The current iteration number.
            indices (torch.Tensor): The input indices tensor.
            offsets (torch.Tensor): The input offsets tensor.
+            op_id (str, optional): The operation identifier. Defaults to an empty string.
            per_sample_weights (Optional[torch.Tensor], optional): Weights for each sample. Defaults to None.
            batch_size_per_feature_per_rank (Optional[List[List[int]]], optional): Batch sizes per feature per rank. Defaults to None.
        """
-        if
-
+        if (
+            (iteration - self.report_iter_start) % self.report_interval == 0
+            and (iteration >= self.report_iter_start)
+            and (self.report_iter_end == -1 or iteration <= self.report_iter_end)
        ):
            # Extract TBE config
            config = self.extract_params(
-
+                feature_rows=feature_rows,
+                feature_dims=feature_dims,
+                indices=indices,
+                offsets=offsets,
+                per_sample_weights=per_sample_weights,
+                batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
            )
 
+            config.json()
+
+            # Ad-hoc fix for adding Es and Ds to JSON output
+            # TODO: Remove this once we moved Es and Ds to be part of TBEDataConfig
+            adhoc_config = config.dict()
+            adhoc_config["Es"] = feature_rows.tolist()
+            adhoc_config["Ds"] = feature_dims.tolist()
+            if batch_size_per_feature_per_rank:
+                adhoc_config["Bs"] = [
+                    sum(batch_size_per_feature_per_rank[f])
+                    for f in range(len(adhoc_config["Es"]))
+                ]
+
            # Write the TBE config to FileStore
            self.filestore.write(
-                f"tbe-{
-                io.BytesIO(
+                f"{self.path_prefix}/tbe-{op_id}-config-estimation-{iteration}.json",
+                io.BytesIO(json.dumps(adhoc_config, indent=2).encode()),
            )
-
-            self.has_reported = True
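Note: the sketch below is not part of the diff; it restates the iteration-window check introduced in report_stats above as a standalone function, with a few example values, so the gating semantics of report_interval, report_iter_start, and report_iter_end are easy to verify.

# Report when (iteration - start) is a multiple of the interval, the iteration
# is at or past the start, and either no end is set (-1) or iteration <= end.
def should_report(iteration: int, interval: int, start: int = 0, end: int = -1) -> bool:
    return (
        (iteration - start) % interval == 0
        and iteration >= start
        and (end == -1 or iteration <= end)
    )

assert should_report(0, interval=100)                    # first report at the start
assert should_report(300, interval=100)                  # every 100 iterations
assert not should_report(150, interval=100)              # off-interval
assert not should_report(50, interval=100, start=100)    # before the window
assert not should_report(250, interval=100, end=200)     # past the window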
fbgemm_gpu/tbe/utils/requests.py
CHANGED

@@ -14,9 +14,6 @@ import numpy as np
 import numpy.typing as npt
 import torch
 
-# pyre-fixme[21]: Could not find name `default_rng` in `numpy.random` (stubbed).
-from numpy.random import default_rng
-
 from .common import get_device
 from .offsets import get_table_batched_offsets_from_dense
 

@@ -309,11 +306,9 @@ def generate_indices_zipf(
        indices, torch.tensor([0, L], dtype=torch.long), True
    )
    if deterministic_output:
-
-    else:
-        rng = default_rng()
+        np.random.seed(12345)
    permutation = torch.as_tensor(
-
+        np.random.choice(E, size=indices.max().item() + 1, replace=False)
    )
    indices = permutation.gather(0, indices.flatten())
    indices = indices.to(get_device()).int()
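Note: the sketch below is not part of the diff; it only demonstrates the determinism property the new code relies on, namely that seeding NumPy's global RNG makes np.random.choice(..., replace=False) reproducible across calls.

import numpy as np
import torch

def deterministic_permutation(E: int, n: int) -> torch.Tensor:
    # Same seed as used above in generate_indices_zipf when deterministic_output is set.
    np.random.seed(12345)
    return torch.as_tensor(np.random.choice(E, size=n, replace=False))

assert torch.equal(
    deterministic_permutation(1000, 64), deterministic_permutation(1000, 64)
)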
fbgemm_gpu/utils/filestore.py
CHANGED

@@ -11,7 +11,6 @@
 import io
 import logging
 import os
-import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import BinaryIO, Union

@@ -76,7 +75,12 @@ class FileStore:
        elif isinstance(raw_input, Path):
            if not os.path.exists(raw_input):
                raise FileNotFoundError(f"File {raw_input} does not exist")
-
+            # Open the source file and destination file, and copy the contents
+            with open(raw_input, "rb") as src_file, open(
+                filepath, "wb"
+            ) as dst_file:
+                while chunk := src_file.read(4096):  # Read 4 KB at a time
+                    dst_file.write(chunk)
 
        elif isinstance(raw_input, io.BytesIO) or isinstance(raw_input, BinaryIO):
            with open(filepath, "wb") as file:

@@ -155,4 +159,53 @@ class FileStore:
            True if file exists, False otherwise.
        """
        filepath = f"{self.bucket}/{path}"
-        return os.path.
+        return os.path.exists(filepath)
+
+    def create_directory(self, path: str) -> "FileStore":
+        """
+        Creates a directory in the file store.
+
+        Args:
+            path (str): The path of the node or symlink to a directory (relative
+                to `self.bucket`) to be created.
+
+        Returns:
+            self. This allows for method-chaining.
+        """
+        filepath = f"{self.bucket}/{path}"
+        event = f"creating directory {filepath}"
+        logger.info(f"FileStore: {event}")
+
+        try:
+            if not os.path.exists(filepath):
+                os.makedirs(filepath, exist_ok=True)
+        except Exception as e:
+            logger.error(f"FileStore: exception occurred when {event}: {e}")
+            raise e
+
+        return self
+
+    def remove_directory(self, path: str) -> "FileStore":
+        """
+        Removes a directory from the file store.
+
+        Args:
+            path (str): The path of the node or symlink to a directory (relative
+                to `self.bucket`) to be removed.
+
+        Returns:
+            self. This allows for method-chaining.
+        """
+        filepath = f"{self.bucket}/{path}"
+        event = f"deleting {filepath}"
+        logger.info(f"FileStore: {event}")
+
+        try:
+            if os.path.isdir(filepath):
+                os.rmdir(filepath)
+
+        except Exception as e:
+            logger.error(f"Manifold: exception occurred when {event}: {e}")
+            raise e
+
+        return self
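Note: the sketch below is not part of the diff; it exercises the new directory helpers together with the write() and exists() methods shown in this file, assuming the open-source FileStore where `bucket` is a local filesystem prefix (as the "/tmp" default elsewhere in this diff implies).

import io
from fbgemm_gpu.utils import FileStore

store = FileStore("/tmp")
store.create_directory("tbe_reports")                       # no-op if it already exists
store.write("tbe_reports/example.json", io.BytesIO(b"{}"))  # bytes copied under the bucket
assert store.exists("tbe_reports/example.json")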
{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_hstu_nightly
-Version: 2025.6.16
+Version: 2025.6.18
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org

@@ -40,9 +40,6 @@ PyTorch GPU operator libraries for training and inference. The library provides
 efficient table batched embedding bag, data layout transformation, and
 quantization supports.
 
-FBGEMM_GPU is currently tested with CUDA 12.4 and 11.8 in CI, and with PyTorch
-packages (2.1+) that are built against those CUDA versions.
-
 See the full [Documentation](https://pytorch.org/FBGEMM) for more information
 on building, installing, and developing with FBGEMM_GPU, as well as the most
 up-to-date support matrix for this library.
{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/RECORD
RENAMED

@@ -2,7 +2,7 @@ fbgemm_gpu/__init__.py,sha256=BrIitwvFsRtKEk1ZBHFUi9j6ZUgoA5K7CvepoBez0u4,3419
 fbgemm_gpu/asmjit.so,sha256=1mgsQhqX1yiUdU9p2w3e7XNhDxhMprHy8qkFKYM01Ww,488288
 fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=u7LfseNeM5gGFQGLAMVO7h2QkFWEOL3ezV5RuhbZn4M,2928
 fbgemm_gpu/enums.py,sha256=GVuzF5cFTLzttkvlH1SdcGrxrppMhDSbQj_Vm_4zmEo,789
-fbgemm_gpu/fbgemm.so,sha256=
+fbgemm_gpu/fbgemm.so,sha256=2giLGFkDpN5f6NtML_Din2J98LCdwJ0kgL_U3sbGoc0,5634864
 fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
 fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=kjWuWmQY8e2kMRwIPTzjGjyjV4syKPrphtHdsQTAjWM,5136
 fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=cUrEbRIvLFW_3Zmh07QkN4S1Cfvvge6TYO1VXBFCpz8,2752

@@ -15,15 +15,15 @@ fbgemm_gpu/split_embedding_inference_converter.py,sha256=ilVVowkTiY0WDpOYorj917T
 fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
 fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
 fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
-fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=
+fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=qbc1n-PPWKc75H0lXxK5kuCCprh4xEMS8A0TiE5fbHs,9906
 fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=bUDWa6IR0vGLDThgB3nmD1yfYa8_HD34B0dtLnd7thw,81692
-fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=IrGhStc8TSwvxjgPtwIVDmfjsXbThmh64pVulNhMR9M,166355
 fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=ktC10-nakOBpcmJNCOGQsxuBCP8XTwXJ2WeEgIg91tc,5455
 fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
 fbgemm_gpu/tbe_input_multiplexer.py,sha256=DjU7dPHgAT1avXGvgi8SFfw2Pq7yT8S_7IH8qCXoptA,3069
 fbgemm_gpu/uvm.py,sha256=-cZunsuvnAKUEQptIwdYVar_3hUE99FbQUsyfBVeXPE,925
 fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
-fbgemm_gpu/config/feature_list.py,sha256=
+fbgemm_gpu/config/feature_list.py,sha256=04l_k0t6nkLRxnvSeO4ZjkGj_If9KQGl8PTl-HmxOIQ,2441
 fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
 fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
 fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377

@@ -32,10 +32,10 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
 fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
 fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
 fbgemm_gpu/docs/sparse_ops.py,sha256=NTcTm0q9h8W2B8PKPoic2fHsAaCbCYunSa_EYK0LtHQ,21382
-fbgemm_gpu/docs/version.py,sha256=
+fbgemm_gpu/docs/version.py,sha256=gWwQLocgNkScd-zRubMAconDahzJwrllhvezB_jXyQs,315
 fbgemm_gpu/experimental/hstu/__init__.py,sha256=KNisP6qDMwgjgxkGlqUZRNjJ_8o8R-cTmm3HxF7pSqI,1564
 fbgemm_gpu/experimental/hstu/cuda_hstu_attention.py,sha256=5425GRjJuzpXQC-TowgQOCFjZmOwv_EK0lKbURhHBTQ,9920
-fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so,sha256=
+fbgemm_gpu/experimental/hstu/fbgemm_gpu_experimental_hstu.so,sha256=9mp_lqP2V4gBxmINu0tafkVMdl5Qu1JiFlSP6Jpglrk,352287576
 fbgemm_gpu/quantize/__init__.py,sha256=pftciXHE7csekDFkl7Ui1AWglVMMnSrOO04mREnUdb0,921
 fbgemm_gpu/quantize/quantize_ops.py,sha256=25AIOv9n2UoxamMUaI6EK1Ur4gSHxbZIReHBtgOjjCs,2228
 fbgemm_gpu/sll/__init__.py,sha256=rgXh35-OFUE54E9gGBq3NGxouGvgMv2ccY2bWUTxONY,4191

@@ -64,8 +64,9 @@ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=T8Wa1PeRyFZ0Ge-SErHQEYDY8LvHVoCV_qQlE_6kE
 fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=mdG3JZwgclp6DiVwQSKl5jrirLSId4OuM64knj9TkEk,4973
 fbgemm_gpu/tbe/bench/eval_compression.py,sha256=bINVERk42VJDSdenQHKWApmRMrW8rhkevOgE0hDR-S8,3499
 fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
-fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=
-fbgemm_gpu/tbe/bench/
+fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=0NxlQtvBb4BBeBiK8DaMVByyJjgzFFgrAsGQt-EFqgM,2913
+fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=uIFdxCBgrz3_l6C9fmE2bMmULhK1eX5ZfB78Pz7tjkw,7312
+fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=ajDmXjxNLxtHu8728CsSZQmuT6nra82jTb9uJJE3yzI,7519
 fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=tuEQgffV-_zGS4zza1I3x9ZWOYGh9jl3Aal1g-78veE,5852
 fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
 fbgemm_gpu/tbe/cache/__init__.py,sha256=oM-g5nq0EXZgO79C6DhAl_Om9FTPC-WiaqclQCG3HTk,323

@@ -73,16 +74,16 @@ fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=mQkCl0xN8xUu5bjEWcOOFN
 fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
 fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
 fbgemm_gpu/tbe/ssd/inference.py,sha256=DTjwj3f6JaUMcecWoRNkZpRgXDJ-eE3grtixYwKb5DI,22829
-fbgemm_gpu/tbe/ssd/training.py,sha256=
+fbgemm_gpu/tbe/ssd/training.py,sha256=GnhVZOxkgYoDgYOh34xL1pg5SwncSoLMv48mSHt4lQc,133710
 fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
 fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=uwwEdUiaVlnWZ_rQax2z28VYROfivdMqIdWLy8IZ6cE,7646
 fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
-fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=
+fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=9HCR8Y0j_5oWGn1KRSNNYKGf_pmbGZyKT_KII8qf2Fc,11670
 fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
 fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
 fbgemm_gpu/tbe/utils/offsets.py,sha256=bs08kDiQ54oucZl6rmPLDs-bN6m1EMa1Wju06mCKZBY,1917
 fbgemm_gpu/tbe/utils/quantize.py,sha256=byjmzGpUjXD_UVAiBKyszmWlzYLkQxq5HBs6hBOuHZo,9185
-fbgemm_gpu/tbe/utils/requests.py,sha256=
+fbgemm_gpu/tbe/utils/requests.py,sha256=uyWxOYxsmSyd48UhPHvDEdxbb-_zDV60FuoKiuTzMdM,17899
 fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
 fbgemm_gpu/triton/common.py,sha256=wnkLd2a8fKpefymLL-LjNKEL4hDVSxFiF5g3aF8mzsw,2131
 fbgemm_gpu/triton/quantize.py,sha256=K5pqBQqs4YsD5m5TibZCbkd0E4Si0i_xcpIeF1B6jA0,26815

@@ -90,10 +91,10 @@ fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2M
 fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
 fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=AIC1G6_QBQtMVTyOyEV4ZKJyDzu36UI_9HDgWmZIRaA,29884
 fbgemm_gpu/utils/__init__.py,sha256=JQQNdcTTaEU6ptK-OW-ZQBwTFxEZZpWOtBXWwEZm39o,354
-fbgemm_gpu/utils/filestore.py,sha256=
+fbgemm_gpu/utils/filestore.py,sha256=oVtbKGaPQki1JgbJCkrkElukOFVyxntQpSC0lYBKgho,6455
 fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,990
 fbgemm_gpu/utils/torch_library.py,sha256=dQcHv1qgpu5QYlJjxjd6oeHjtxnmmXzx3PL6vjCmxL4,4199
-fbgemm_gpu_hstu_nightly-2025.6.
-fbgemm_gpu_hstu_nightly-2025.6.
-fbgemm_gpu_hstu_nightly-2025.6.
-fbgemm_gpu_hstu_nightly-2025.6.
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/METADATA,sha256=rFbfG2H1ql2hm2bSjq3oSTKiMe3RXdKYafu9kp7D4qU,2654
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/WHEEL,sha256=Nkv8TSWVt7XcnRf1cdq5HOzycTl6Pjzlmn7gPSv4NiQ,108
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/top_level.txt,sha256=2tlbTWLkPjhqvLF_6BbqKzkcPluSE-oPRVjI8axK76I,11
+fbgemm_gpu_hstu_nightly-2025.6.18.dist-info/RECORD,,
{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/WHEEL
RENAMED
File without changes

{fbgemm_gpu_hstu_nightly-2025.6.16.dist-info → fbgemm_gpu_hstu_nightly-2025.6.18.dist-info}/top_level.txt
RENAMED
File without changes