fbgemm-gpu-genai-nightly 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fbgemm-gpu-genai-nightly might be problematic.

Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
fbgemm_gpu/experimental/gen_ai/quantize.py ADDED
@@ -0,0 +1,307 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ # Helper functions for using FBGEMM quantized operators.
+
+
+ import torch
+
+ from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row
+
+
+ def pack_int4(x: torch.Tensor) -> torch.Tensor:
+     # Given int8 x, pack adjacent int4 values into a single int8.
+     low_x = x[:, ::2]
+     high_x = x[:, 1::2]
+
+     # High bits need to left shift, this also masks off extra bits.
+     high_x = torch.bitwise_left_shift(high_x, 4)
+     # Low bits need to have sign bits removed.
+     low_x = torch.bitwise_and(low_x, 0xF)
+
+     # Recombine into a single value with bitwise or.
+     return torch.bitwise_or(low_x, high_x).contiguous()
+
+
+ def int4_row_quantize_zp(
+     x: torch.Tensor,
+     group_size: int = 128,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     n_bit = 4  # Number of target bits.
+     # Split input into chunks of group_size. This approach allows K that isn't divisible by group_size.
+     to_quant = torch.split(x.to(torch.float), group_size, dim=-1)
+
+     max_val = [chunk.amax(dim=1, keepdim=True) for chunk in to_quant]
+     min_val = [chunk.amin(dim=1, keepdim=True) for chunk in to_quant]
+     max_int = 2**n_bit - 1
+     min_int = 0
+     scales = [
+         (max_chunk - min_chunk).clamp(min=1e-6) / max_int
+         for max_chunk, min_chunk in zip(max_val, min_val)
+     ]
+
+     zeros = [
+         min_chunk + scale_chunk * (2 ** (n_bit - 1))
+         for min_chunk, scale_chunk in zip(min_val, scales)
+     ]
+
+     out = [
+         chunk.sub(min_chunk).div(scale_chunk).round().clamp_(min_int, max_int)
+         for chunk, min_chunk, scale_chunk in zip(to_quant, min_val, scales)
+     ]
+
+     # Recenter output and move to int8.
+     out = [(chunk - 2 ** (n_bit - 1)).to(dtype=torch.int8) for chunk in out]
+
+     # Recombine chunks.
+     out = torch.cat(out, dim=-1)
+
+     # Cutlass expects column major layout for scale and zero point,
+     # so we transpose here and make them contiguous.
+     scales = torch.cat(scales, dim=-1).t().contiguous()
+     zeros = torch.cat(zeros, dim=-1).t().contiguous()
+
+     return out, scales, zeros
+
+
+ def int4_row_quantize(
+     x: torch.Tensor,
+     group_size: int = 128,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     """
+     Helper function to quantize a tensor to int4 with groupwise scales.
+
+     Args:
+         x (Tensor): [N, K] Higher precision weight tensor to quantize.
+         group_size (int): Number of elements to calculate group scale for.
+     Returns:
+         wq (Tensor): [N, K] Quantized int4 tensor stored in int8 elements.
+         group_scale (Tensor): [K / group_size, N] FP32 Scale per group.
+     """
+     n_bit = 4  # Number of target bits.
+     # Split input into chunks of group_size. This approach allows K that isn't divisible by group_size.
+     to_quant = torch.split(x.to(torch.float), group_size, dim=-1)
+
+     max_val = [torch.abs(chunk).amax(dim=-1, keepdim=True) for chunk in to_quant]
+     max_int = 2 ** (n_bit - 1)
+     min_int = -(2 ** (n_bit - 1))
+     scales = [chunk.clamp(min=1e-6) / max_int for chunk in max_val]
+
+     out = [
+         chunk.div(chunk_scale).round().clamp_(min_int, max_int - 1)
+         for chunk, chunk_scale in zip(to_quant, scales)
+     ]
+     # Recombine chunks.
+     out = torch.cat(out, dim=-1)
+
+     # Cast to int8 and restore shape.
+     out = out.to(dtype=torch.int8)
+
+     # Scales should be in [num_groups, N] layout.
+     scales = torch.cat(scales, dim=-1).t().contiguous()
+
+     return out, scales
+
+
+ def quantize_int4_preshuffle(
+     w: torch.Tensor, group_size: int = 128, dtype: str = "fp8", use_zp: bool = True
+ ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+     """
+     Quantizes an input weight tensor to int4 using preshuffling and scale packing.
+     This function is intended to be used with FBGEMM's mixed dtype kernels and is expected
+     to be applied to weights ahead of time. As such, it is not perfectly optimized.
+
+     Args:
+         w (Tensor): [N, K] Higher precision weight tensor to quantize. May optionally have a batch dimension.
+         group_size (int): Number of elements to calculate group scale for, must be at least 128.
+         dtype (str): Type of corresponding activations. Must be fp8 or bf16.
+         use_zp (bool): If true, uses zero points during weight quantization. Only relevant for bf16 currently.
+     Returns:
+         wq (Tensor): [N, K // 2] Quantized int4 weight tensor packed into int8 elements.
+         scales (Tuple[Tensor]): Scale tensors for the specified activation type. When FP8 is used,
+         scales is a tuple of row_scale ([N]) and group_scale ([K / group_size, 8, N]). When BF16 is
+         used, scales is a tuple of group_scale ([K / group_size, N]) and group_zero ([K / group_size, N]).
+     """
+
+     def _quantize(
+         w: torch.Tensor, dtype: str = "fp8"
+     ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+
+         if dtype == "fp8":
+             # Start by lowering weights to FP8 and producing row scales.
+             wq, row_scale = quantize_fp8_row(w)
+
+             # Now reduce to INT4.
+             wq, group_scale = int4_row_quantize(wq, group_size)
+             # Reduce group scale to FP8.
+             group_scale = group_scale.to(torch.float8_e4m3fn)
+             # Take quantized weights and pack them efficiently.
+             wq = pack_int4(wq)
+             # Finally pack weights and scales into efficient preshuffled format.
+             wq, group_scale = torch.ops.fbgemm.preshuffle_i4(wq, group_scale)
+             return wq, (group_scale, row_scale)
+
+         elif dtype == "bf16":
+             if use_zp:
+                 wq, group_scale, group_zero = int4_row_quantize_zp(w, group_size)
+             else:
+                 wq, group_scale = int4_row_quantize(w, group_size)
+                 group_zero = torch.zeros_like(group_scale)
+             # Set scales to activation type.
+             group_scale = group_scale.to(torch.bfloat16)
+             group_zero = group_zero.to(torch.bfloat16)
+             # Take quantized weights and pack them efficiently.
+             wq = pack_int4(wq)
+             # Finally pack weights and scales into efficient preshuffled format.
+             wq, group_scale = torch.ops.fbgemm.preshuffle_i4(wq, group_scale)
+             return wq, (group_scale, group_zero)
+         else:
+             raise NotImplementedError("Only fp8 and bf16 activations supported.")
+
+     if w.ndim >= 3:
+         orig_shape = w.shape
+         # Flatten to 3 dimensions then iterate over batches.
+         wq, scales = zip(*[_quantize(i, dtype=dtype) for i in w])
+         wq = torch.stack(wq).view(*orig_shape[:-2], *wq[0].shape)
+         # Decompose then stack scales back into a tuple.
+         a_scales, b_scales = zip(*scales)
+         scales = (
+             torch.stack(a_scales).view(*orig_shape[:-2], *a_scales[0].shape),
+             torch.stack(b_scales).view(*orig_shape[:-2], *b_scales[0].shape),
+         )
+     else:
+         wq, scales = _quantize(w, dtype=dtype)
+
+     return wq, scales
+
+
+ def shuffle_slice(
+     x: torch.Tensor, dim: int, start: int, length: int, dtype: str = "fp8"
+ ) -> torch.Tensor:
+     """
+     Helper function to slice a preshuffled int4 tensor. This is needed since the shuffling
+     reorders rows based on the size of the input. Slicing a tensor shuffled for a larger input
+     is no longer valid. We must reorder the tensor to the appropriate size then slice.
+     Args:
+         x (Tensor): [N, K // 2] Preshuffled int4 tensor.
+         dim (int): Dimension to slice.
+         start (int): Start of slice.
+         length (int): Number of elements to slice in the original [N, K] dimension.
+         dtype (str): Type of corresponding activations. Must be fp8 or bf16.
+     Returns:
+         sliced (Tensor): [length, K // 2] Sliced tensor.
+     """
+     # Get the size of the input tensor.
+     assert dim in [x.ndim - 2, x.ndim - 1], "Only slicing along N or K is supported."
+     assert length % 16 == 0, "Slicing must be a multiple of 16."
+     orig_shape = x.shape
+     N = x.shape[-2]
+     K = x.shape[-1]
+     # Tile shape is based on the activation dtype.
+     assert dtype in ("fp8", "bf16"), "Only fp8 and bf16 activations supported."
+     # Handle slice along N
+     if dim == x.ndim - 2:
+         tile_shape = 8 if dtype == "fp8" else 16
+         block_size = N // length
+         # View the shape in terms of shuffled tiles then permute to allow slicing.
+         x_s = x.view(-1, tile_shape, block_size, length // tile_shape, K)
+         x_s = x_s.permute(0, 2, 1, 3, 4).contiguous().view(-1, N, K)
+         out_slice = x_s.narrow(1, start, length)
+         # Reshape back to original shape.
+         return out_slice.view(*orig_shape[:-2], length, K)
+     # Handle slice along K
+     else:
+         outer_dim = x.view(-1, N, K).shape[0]
+         x_s = x.view(outer_dim, -1, length // 2)
+         row_factor = x_s.shape[1] * (length // 2) // K
+         # Take slices of rows corresponding to column slice.
+         return x_s.narrow(1, start * 2 * K // length, row_factor).view(
+             *orig_shape[:-2], N, length // 2
+         )
+
+
+ def scale_nvfp4_quant(
+     input: torch.Tensor, input_global_scale: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     """
+     Quantize input tensor to FP4 and return quantized tensor and scale.
+     This function quantizes the last dimension of the given tensor `input`. For
+     every 16 consecutive elements, a single dynamically computed scaling factor
+     is shared. This scaling factor is quantized using the `input_global_scale`
+     and is stored in a swizzled layout (see
+     https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).
+     Args:
+         input: The input tensor to be quantized to FP4
+         input_global_scale: A scalar scaling factor for the entire tensor.
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4, with every
+         two values packed into a uint8, and the float8_e4m3 scaling factors
+         in the swizzled layout.
+     """
+     assert input.ndim >= 1, f"input.ndim needs to be >= 1, but got {input.ndim}."
+     other_dims = 1 if input.ndim == 1 else -1
+     input = input.reshape(other_dims, input.shape[-1])
+     m, n = input.shape
+     block_size = 16
+     device = input.device
+
+     assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
+     assert input.dtype in (
+         torch.float16,
+         torch.bfloat16,
+     ), f"input.dtype needs to be fp16 or bf16 but got {input.dtype}."
+
+     # Two fp4 values will be packed into an uint8.
+     output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
+
+     # We use the rounded values to store the swizzled values. Due to the
+     # requirement of the Tensor Core, the minimum tile is 128x4 for the scales.
+     # So, we first pad the scales to multiples of 128 and 4. Then, the scales
+     # (in float8_e4m3fn) are packed into an int32 for every 4 values. More:
+     # https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x
+     def round_up(x: int, y: int) -> int:
+         return (x + y - 1) // y * y
+
+     rounded_m = round_up(m, 128)
+     scale_n = n // block_size
+     rounded_n = round_up(scale_n, 4)
+     output_scale = torch.empty(
+         (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
+     )
+
+     torch.ops.fbgemm.scaled_fp4_quant(output, input, output_scale, input_global_scale)
+     output_scale = output_scale.view(torch.float8_e4m3fn)
+     return output, output_scale
+
+
+ def ck_preshuffle(src: torch.Tensor, NXdl: int = 16) -> torch.Tensor:
+     """
+     Applies shuffling to make weights more efficient for use with CK kernels.
+     Args:
+         src (torch.Tensor): Input tensor with dtype float8_e4m3fnuz.
+         NXdl (int): Wave tile size along N.
+     Returns:
+         torch.Tensor: The shuffled tensor.
+     """
+     # Check input datatype
+     if src.dtype != torch.float8_e4m3fnuz:
+         raise TypeError("Input must be type float8_e4m3fnuz.")
+     N, K = src.shape
+     KPack = 16
+     NLane = NXdl
+     KLane = 64 // NLane
+     K0 = K // (KLane * KPack)
+     # Reshape src to enable the required permutation
+     # Original shape: (N, K)
+     # Desired intermediate shape for permutation: (N0, NLane, K0, KLane, KPack)
+     src = src.reshape(N // NLane, NLane, K0, KLane, KPack)
+     # Apply permutation: (N0, NLane, K0, KLane, KPack) -> (N0, K0, KLane, NLane, KPack)
+     dst = src.permute(0, 2, 3, 1, 4).contiguous()
+     # Reshape to original input shape.
+     dst = dst.reshape(N, K)
+     return dst
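
For reference, a minimal usage sketch of the int4 helpers in the file above, assuming the wheel installs cleanly and its native libraries load. It exercises only the pure-PyTorch helpers (int4_row_quantize and pack_int4); quantize_int4_preshuffle additionally calls quantize_fp8_row and the torch.ops.fbgemm.preshuffle_i4 operator, which need the bundled GPU kernels. Shapes and values here are illustrative only.

    import torch
    from fbgemm_gpu.experimental.gen_ai.quantize import int4_row_quantize, pack_int4

    # Illustrative [N, K] weight; K is split into group_size-wide chunks for scaling.
    w = torch.randn(64, 256, dtype=torch.bfloat16)

    wq, group_scale = int4_row_quantize(w, group_size=128)
    print(wq.shape, wq.dtype)    # torch.Size([64, 256]) torch.int8 (values in [-8, 7])
    print(group_scale.shape)     # torch.Size([2, 64]) == [K / group_size, N]

    packed = pack_int4(wq)       # two int4 values packed per int8 byte
    print(packed.shape)          # torch.Size([64, 128])
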
fbgemm_gpu/fbgemm.so ADDED
Binary file
fbgemm_gpu/metrics.py ADDED
@@ -0,0 +1,160 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ from typing import Any, Callable
+
+ import torch
+
+
+ class BatchAuc(torch.nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+
+     def forward(
+         self,
+         n_tasks: int,
+         predictions: torch.Tensor,
+         labels: torch.Tensor,
+         weights: torch.Tensor,
+     ) -> torch.Tensor:
+         _, sorted_indices = torch.sort(predictions, descending=True, dim=-1)
+         sorted_labels = torch.gather(labels, 1, sorted_indices)
+         sorted_weights = torch.gather(weights, 1, sorted_indices)
+         cum_fp = torch.cumsum(sorted_weights * (1.0 - sorted_labels), dim=-1)
+         cum_tp = torch.cumsum(sorted_weights * sorted_labels, dim=-1)
+         fac = cum_fp[:, -1] * cum_tp[:, -1]
+         auc = torch.where(fac == 0, 0.5, torch.trapz(cum_tp, cum_fp, dim=-1) / fac)
+         return auc
+
+
+ class Auc(torch.nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+
+     def forward(
+         self,
+         n_tasks: int,
+         predictions: torch.Tensor,
+         labels: torch.Tensor,
+         weights: torch.Tensor,
+     ) -> torch.Tensor:
+         _, sorted_indices = torch.sort(predictions, descending=True, dim=-1)
+         aucs = []
+         for sorted_indices_i, labels_i, weights_i in zip(
+             sorted_indices, labels, weights
+         ):
+             sorted_labels = torch.index_select(labels_i, dim=0, index=sorted_indices_i)
+             sorted_weights = torch.index_select(
+                 weights_i, dim=0, index=sorted_indices_i
+             )
+             cum_fp = torch.cumsum(sorted_weights * (1.0 - sorted_labels), dim=0)
+             cum_tp = torch.cumsum(sorted_weights * sorted_labels, dim=0)
+             auc = torch.where(
+                 cum_fp[-1] * cum_tp[-1] == 0,
+                 0.5,  # 0.5 is the no-signal default value for auc.
+                 torch.trapz(cum_tp, cum_fp) / cum_fp[-1] / cum_tp[-1],
+             )
+             aucs.append(auc.view(1))
+         return torch.cat(aucs)
+
+
+ class AucJiterator(torch.nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+         # Jiterator only works with elementwise kernels
+         fp_code_string = """
+         template <typename T> T fp(T weights, T labels) {
+             return weights * (1.0 - labels);
+         }"""
+
+         tp_code_string = """
+         template <typename T> T tp(T weights, T labels) {
+             return weights * labels;
+         }"""
+
+         # pyre-ignore [4]
+         self.jitted_fp: Callable[..., Any] = torch.cuda.jiterator._create_jit_fn(
+             fp_code_string
+         )
+         # pyre-ignore [4]
+         self.jitted_tp: Callable[..., Any] = torch.cuda.jiterator._create_jit_fn(
+             tp_code_string
+         )
+
+     def forward(
+         self,
+         n_tasks: int,
+         predictions: torch.Tensor,
+         labels: torch.Tensor,
+         weights: torch.Tensor,
+     ) -> torch.Tensor:
+         _, sorted_indices = torch.sort(predictions, descending=True, dim=-1)
+         aucs = []
+         for sorted_indices_i, labels_i, weights_i in zip(
+             sorted_indices, labels, weights
+         ):
+             sorted_labels = torch.index_select(labels_i, dim=0, index=sorted_indices_i)
+             sorted_weights = torch.index_select(
+                 weights_i, dim=0, index=sorted_indices_i
+             )
+             cum_fp = torch.cumsum(self.jitted_fp(sorted_weights, sorted_labels), dim=0)
+             cum_tp = torch.cumsum(self.jitted_tp(sorted_weights, sorted_labels), dim=0)
+             auc = torch.where(
+                 cum_fp[-1] * cum_tp[-1] == 0,
+                 0.5,  # 0.5 is the no-signal default value for auc.
+                 torch.trapz(cum_tp, cum_fp) / cum_fp[-1] / cum_tp[-1],
+             )
+             aucs.append(auc.view(1))
+         return torch.cat(aucs)
+
+
+ class BatchAucJiterator(torch.nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+         # Jiterator only works with elementwise kernels
+         fp_code_string = """
+         template <typename T> T fp(T weights, T labels) {
+             return weights * (1.0 - labels);
+         }"""
+
+         tp_code_string = """
+         template <typename T> T tp(T weights, T labels) {
+             return weights * labels;
+         }"""
+
+         # pyre-ignore [4]
+         self.jitted_fp: Callable[..., Any] = torch.cuda.jiterator._create_jit_fn(
+             fp_code_string
+         )
+         # pyre-ignore [4]
+         self.jitted_tp: Callable[..., Any] = torch.cuda.jiterator._create_jit_fn(
+             tp_code_string
+         )
+
+     def forward(
+         self,
+         n_tasks: int,
+         predictions: torch.Tensor,
+         labels: torch.Tensor,
+         weights: torch.Tensor,
+     ) -> torch.Tensor:
+         _, sorted_indices = torch.sort(predictions, descending=True, dim=-1)
+         sorted_labels = torch.gather(labels, 1, sorted_indices)
+         sorted_weights = torch.gather(weights, 1, sorted_indices)
+         cum_fp = torch.cumsum(self.jitted_fp(sorted_weights, sorted_labels), dim=-1)
+         cum_tp = torch.cumsum(self.jitted_tp(sorted_weights, sorted_labels), dim=-1)
+         fac = cum_fp[:, -1] * cum_tp[:, -1]
+         auc = torch.where(fac == 0, 0.5, torch.trapz(cum_tp, cum_fp, dim=-1) / fac)
+         return auc
+
+
+ def auc(
+     n_tasks: int, predictions: torch.Tensor, labels: torch.Tensor, weights: torch.Tensor
+ ) -> torch.Tensor:
+     _, sorted_indices = torch.sort(predictions, descending=True, dim=-1)
+     return torch.ops.fbgemm.batch_auc(n_tasks, sorted_indices, labels, weights)
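
A small CPU sketch of the Auc module above, with illustrative shapes; the Jiterator variants require CUDA, and the auc() free function additionally requires the bundled torch.ops.fbgemm.batch_auc operator.

    import torch
    from fbgemm_gpu.metrics import Auc

    n_tasks, batch_size = 2, 8                      # illustrative sizes
    predictions = torch.rand(n_tasks, batch_size)
    labels = torch.randint(0, 2, (n_tasks, batch_size)).float()
    weights = torch.ones(n_tasks, batch_size)

    auc_values = Auc()(n_tasks, predictions, labels, weights)
    print(auc_values.shape)                         # torch.Size([2]), one AUC per task
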
fbgemm_gpu/permute_pooled_embedding_modules.py ADDED
@@ -0,0 +1,142 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ from itertools import accumulate
+ from typing import Optional
+
+ import torch
+
+ from fbgemm_gpu.utils.loader import load_torch_module
+
+ try:
+     # pyre-ignore[21]
+     from fbgemm_gpu import open_source  # noqa: F401
+ except Exception:
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_cpu"
+     )
+     load_torch_module(
+         "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_gpu"
+     )
+
+
+ class PermutePooledEmbeddings:
+     """
+     A module for permuting embedding outputs along the feature dimension
+
+     An embedding output tensor contains the embedding outputs for all features
+     in a batch. It is represented in a 2D format, where the rows are the batch
+     size dimension and the columns are the feature * embedding dimension.
+     Permuting along the feature dimension is essentially permuting along the
+     second dimension (dim 1).
+
+     **Example:**
+
+         >>> import torch
+         >>> import fbgemm_gpu
+         >>> from fbgemm_gpu.permute_pooled_embedding_modules import PermutePooledEmbeddings
+         >>>
+         >>> # Suppose batch size = 3 and there are 3 features
+         >>> batch_size = 3
+         >>>
+         >>> # Embedding dimensions for each feature
+         >>> embs_dims = torch.tensor([4, 4, 8], dtype=torch.int64, device="cuda")
+         >>>
+         >>> # Permute list, i.e., move feature 2 to position 0, move feature 0
+         >>> # to position 1, so on
+         >>> permute = [2, 0, 1]
+         >>>
+         >>> # Instantiate the module
+         >>> perm = PermutePooledEmbeddings(embs_dims, permute)
+         >>>
+         >>> # Generate an example input
+         >>> pooled_embs = torch.arange(
+         >>>     embs_dims.sum().item() * batch_size,
+         >>>     dtype=torch.float32, device="cuda"
+         >>> ).reshape(batch_size, -1)
+         >>> print(pooled_embs)
+         >>>
+         tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
+                  14., 15.],
+                 [16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29.,
+                  30., 31.],
+                 [32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45.,
+                  46., 47.]], device='cuda:0')
+         >>>
+         >>> # Invoke
+         >>> perm(pooled_embs)
+         >>>
+         tensor([[ 8.,  9., 10., 11., 12., 13., 14., 15.,  0.,  1.,  2.,  3.,  4.,  5.,
+                   6.,  7.],
+                 [24., 25., 26., 27., 28., 29., 30., 31., 16., 17., 18., 19., 20., 21.,
+                  22., 23.],
+                 [40., 41., 42., 43., 44., 45., 46., 47., 32., 33., 34., 35., 36., 37.,
+                  38., 39.]], device='cuda:0')
+
+     Args:
+         embs_dims (List[int]): A list of embedding dimensions for all features.
+             Length = the number of features
+
+         permute (List[int]): A list that describes how each feature is
+             permuted. `permute[i]` is to permute feature `permute[i]` to
+             position `i`.
+
+         device (Optional[torch.device] = None): The device to run this module
+             on
+     """
+
+     def __init__(
+         self,
+         embs_dims: list[int],
+         permute: list[int],
+         device: Optional[torch.device] = None,
+     ) -> None:
+         self._offset_dim_list: torch.Tensor = torch.tensor(
+             [0] + list(accumulate(embs_dims)), device=device, dtype=torch.int64
+         )
+
+         self._permute: torch.Tensor = torch.tensor(
+             permute, device=device, dtype=torch.int64
+         )
+
+         inv_permute: list[int] = [0] * len(permute)
+         for i, p in enumerate(permute):
+             inv_permute[p] = i
+
+         self._inv_permute: torch.Tensor = torch.tensor(
+             inv_permute, device=device, dtype=torch.int64
+         )
+
+         inv_embs_dims = [embs_dims[i] for i in permute]
+
+         self._inv_offset_dim_list: torch.Tensor = torch.tensor(
+             [0] + list(accumulate(inv_embs_dims)), device=device, dtype=torch.int64
+         )
+
+     def __call__(self, pooled_embs: torch.Tensor) -> torch.Tensor:
+         """
+         Performs pooled embedding output permutation along the feature dimension
+
+         Args:
+             pooled_embs (Tensor): The embedding outputs to permute. Shape is
+                 `(B_local, total_global_D)`, where `B_local` = a local batch
+                 size and `total_global_D` is the total embedding dimension
+                 across all features (global)
+
+         Returns:
+             Permuted embedding outputs (Tensor). Same shape as `pooled_embs`
+         """
+         result = torch.ops.fbgemm.permute_pooled_embs_auto_grad(
+             pooled_embs,
+             self._offset_dim_list.to(device=pooled_embs.device),
+             self._permute.to(device=pooled_embs.device),
+             self._inv_offset_dim_list.to(device=pooled_embs.device),
+             self._inv_permute.to(device=pooled_embs.device),
+         )
+         return result
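
The docstring example above runs on CUDA through the fused permute_pooled_embs_auto_grad operator; the same permutation semantics can be reproduced in plain PyTorch for a quick sanity check (a sketch only, not how the operator is implemented):

    import torch

    embs_dims = [4, 4, 8]       # per-feature embedding dims, as in the docstring example
    permute = [2, 0, 1]         # output position i takes feature permute[i]
    batch_size = 3

    pooled_embs = torch.arange(
        batch_size * sum(embs_dims), dtype=torch.float32
    ).reshape(batch_size, -1)

    # Split the pooled output into per-feature column segments, then reorder them.
    segments = torch.split(pooled_embs, embs_dims, dim=1)
    reference = torch.cat([segments[p] for p in permute], dim=1)
    print(reference[0])         # tensor([ 8., ..., 15., 0., ..., 7.]), matching the docstring output
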
fbgemm_gpu/permute_pooled_embedding_modules_split.py ADDED
@@ -0,0 +1,85 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import logging
+ from itertools import accumulate
+ from typing import Optional
+
+ import torch
+ from torch import nn
+
+ try:
+     # pyre-ignore[21]
+     from fbgemm_gpu import open_source  # noqa: F401
+ except Exception:
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_split_gpu"
+     )
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu:permute_pooled_embedding_ops_split_cpu"
+     )
+
+
+ @torch.fx.wrap
+ def _fx_wrap_tensor_to_device(t: torch.Tensor, device: torch.device) -> torch.Tensor:
+     return t.to(device=device)
+
+
+ class PermutePooledEmbeddingsSplit(nn.Module):
+     def __init__(
+         self,
+         embs_dims: list[int],
+         permute: list[int],
+         device: Optional[torch.device] = None,
+     ) -> None:
+         super(PermutePooledEmbeddingsSplit, self).__init__()
+         logging.info("Using Permute Pooled Embeddings")
+
+         self.register_buffer(
+             "_offset_dim_list",
+             torch.tensor(
+                 [0] + list(accumulate(embs_dims)), device=device, dtype=torch.int64
+             ),
+         )
+         self.register_buffer(
+             "_permute", torch.tensor(permute, device=device, dtype=torch.int64)
+         )
+
+         inv_permute: list[int] = [0] * len(permute)
+         for i, p in enumerate(permute):
+             inv_permute[p] = i
+
+         self.register_buffer(
+             "_inv_permute", torch.tensor(inv_permute, device=device, dtype=torch.int64)
+         )
+
+         # `Union[BoundMethod[typing.Callable(torch.Tensor.tolist)[[Named(self,
+         # torch.Tensor)], List[typing.Any]], torch.Tensor], nn.Module, torch.Tensor]`
+         # is not a function.
+
+         inv_embs_dims = [embs_dims[i] for i in permute]
+
+         self.register_buffer(
+             "_inv_offset_dim_list",
+             torch.tensor(
+                 [0] + list(accumulate(inv_embs_dims)), device=device, dtype=torch.int64
+             ),
+         )
+
+     def forward(self, pooled_embs: torch.Tensor) -> torch.Tensor:
+         result = torch.ops.fbgemm.permute_pooled_embs_auto_grad_split(
+             pooled_embs,
+             _fx_wrap_tensor_to_device(self._offset_dim_list, device=pooled_embs.device),
+             _fx_wrap_tensor_to_device(self._permute, device=pooled_embs.device),
+             _fx_wrap_tensor_to_device(
+                 self._inv_offset_dim_list, device=pooled_embs.device
+             ),
+             _fx_wrap_tensor_to_device(self._inv_permute, device=pooled_embs.device),
+         )
+         return result
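
PermutePooledEmbeddingsSplit is invoked the same way as PermutePooledEmbeddings, but as an nn.Module with registered buffers. A minimal sketch, assuming the permute_pooled_embs_auto_grad_split operator shipped with this wheel is available for the target device:

    import torch
    from fbgemm_gpu.permute_pooled_embedding_modules_split import (
        PermutePooledEmbeddingsSplit,
    )

    embs_dims = [4, 4, 8]
    permute = [2, 0, 1]
    module = PermutePooledEmbeddingsSplit(embs_dims, permute)

    pooled_embs = torch.arange(3 * sum(embs_dims), dtype=torch.float32).reshape(3, -1)
    out = module(pooled_embs)   # same feature reordering as above, with autograd support
    print(out.shape)            # torch.Size([3, 16])
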