fbgemm-gpu-genai-nightly 2025.12.19-cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fbgemm-gpu-genai-nightly might be problematic.
- fbgemm_gpu/__init__.py +186 -0
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
- fbgemm_gpu/config/__init__.py +9 -0
- fbgemm_gpu/config/feature_list.py +88 -0
- fbgemm_gpu/docs/__init__.py +18 -0
- fbgemm_gpu/docs/common.py +9 -0
- fbgemm_gpu/docs/examples.py +73 -0
- fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
- fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
- fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
- fbgemm_gpu/docs/quantize_ops.py +41 -0
- fbgemm_gpu/docs/sparse_ops.py +616 -0
- fbgemm_gpu/docs/target.genai.json.py +6 -0
- fbgemm_gpu/enums.py +24 -0
- fbgemm_gpu/experimental/example/__init__.py +29 -0
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/example/utils.py +20 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
- fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
- fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
- fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
- fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
- fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
- fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
- fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
- fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
- fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
- fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
- fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
- fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/metrics.py +160 -0
- fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
- fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
- fbgemm_gpu/quantize/__init__.py +43 -0
- fbgemm_gpu/quantize/quantize_ops.py +64 -0
- fbgemm_gpu/quantize_comm.py +315 -0
- fbgemm_gpu/quantize_utils.py +246 -0
- fbgemm_gpu/runtime_monitor.py +237 -0
- fbgemm_gpu/sll/__init__.py +189 -0
- fbgemm_gpu/sll/cpu/__init__.py +80 -0
- fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
- fbgemm_gpu/sll/meta/__init__.py +35 -0
- fbgemm_gpu/sll/meta/meta_sll.py +337 -0
- fbgemm_gpu/sll/triton/__init__.py +127 -0
- fbgemm_gpu/sll/triton/common.py +38 -0
- fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
- fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
- fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
- fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
- fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
- fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
- fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
- fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
- fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
- fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
- fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
- fbgemm_gpu/sparse_ops.py +1455 -0
- fbgemm_gpu/split_embedding_configs.py +452 -0
- fbgemm_gpu/split_embedding_inference_converter.py +175 -0
- fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
- fbgemm_gpu/split_embedding_utils.py +29 -0
- fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
- fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
- fbgemm_gpu/tbe/__init__.py +6 -0
- fbgemm_gpu/tbe/bench/__init__.py +55 -0
- fbgemm_gpu/tbe/bench/bench_config.py +156 -0
- fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
- fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
- fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
- fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
- fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
- fbgemm_gpu/tbe/bench/reporter.py +35 -0
- fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
- fbgemm_gpu/tbe/bench/utils.py +48 -0
- fbgemm_gpu/tbe/cache/__init__.py +11 -0
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
- fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
- fbgemm_gpu/tbe/ssd/__init__.py +15 -0
- fbgemm_gpu/tbe/ssd/common.py +46 -0
- fbgemm_gpu/tbe/ssd/inference.py +586 -0
- fbgemm_gpu/tbe/ssd/training.py +4908 -0
- fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
- fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
- fbgemm_gpu/tbe/stats/__init__.py +10 -0
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
- fbgemm_gpu/tbe/utils/__init__.py +13 -0
- fbgemm_gpu/tbe/utils/common.py +42 -0
- fbgemm_gpu/tbe/utils/offsets.py +65 -0
- fbgemm_gpu/tbe/utils/quantize.py +251 -0
- fbgemm_gpu/tbe/utils/requests.py +556 -0
- fbgemm_gpu/tbe_input_multiplexer.py +108 -0
- fbgemm_gpu/triton/__init__.py +22 -0
- fbgemm_gpu/triton/common.py +77 -0
- fbgemm_gpu/triton/jagged/__init__.py +8 -0
- fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
- fbgemm_gpu/triton/quantize.py +647 -0
- fbgemm_gpu/triton/quantize_ref.py +286 -0
- fbgemm_gpu/utils/__init__.py +11 -0
- fbgemm_gpu/utils/filestore.py +211 -0
- fbgemm_gpu/utils/loader.py +36 -0
- fbgemm_gpu/utils/torch_library.py +132 -0
- fbgemm_gpu/uvm.py +40 -0
- fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
- fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
- fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
- fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
- list_versions/__init__.py +12 -0
- list_versions/cli_run.py +163 -0
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py
ADDED
@@ -0,0 +1,333 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional

import torch
from torch.library import register_fake


torch.library.define(
    "blackwell_fmha::fmha_fwd",
    "(Tensor q, Tensor k, Tensor v, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, Tensor? seqlen_kv, Tensor? page_table, int seqlen_k=-1, int window_size_left=-1, int window_size_right=-1, bool bottom_right=True) -> (Tensor, Tensor)",
    tags=torch.Tag.pt2_compliant_tag,
)

torch.library.define(
    "blackwell_fmha::fmha_bwd",
    "(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, int window_size_left=-1, int window_size_right=-1, bool bottom_right=True, bool deterministic=False) -> (Tensor, Tensor, Tensor)",
    tags=torch.Tag.pt2_compliant_tag,
)


@torch.library.impl("blackwell_fmha::fmha_fwd", "cuda")
def custom_op_fmha(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: Optional[torch.Tensor] = None,
    cu_seqlens_k: Optional[torch.Tensor] = None,
    max_seq_len_q: Optional[int] = None,
    max_seq_len_k: Optional[int] = None,
    softmax_scale: Optional[float] = None,
    causal: bool = False,
    seqlen_kv: Optional[torch.Tensor] = None,
    page_table: Optional[torch.Tensor] = None,
    seqlen_k: Optional[int] = None,
    window_size_left: int = -1,
    window_size_right: int = -1,
    bottom_right: bool = True,
) -> tuple[torch.Tensor, torch.Tensor]:
    assert q.is_contiguous(), "q is not contiguous"
    assert k.is_contiguous(), "k is not contiguous"
    assert v.is_contiguous(), "v is not contiguous"
    assert q.is_cuda, "q must be on GPU"
    assert k.is_cuda, "k must be on GPU"
    assert v.is_cuda, "v must be on GPU"

    return torch.ops.fbgemm.fmha_fwd(
        q,
        k,
        v,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seq_len_q=max_seq_len_q,
        max_seq_len_k=max_seq_len_k,
        softmax_scale=softmax_scale,
        causal=causal,
        seqlen_kv=seqlen_kv,
        page_table=page_table,
        seqlen_k=seqlen_k,
        window_size_left=window_size_left,
        window_size_right=window_size_right,
        bottom_right=bottom_right,
    )


@register_fake("blackwell_fmha::fmha_fwd")
def fmha_fwd_meta(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: Optional[torch.Tensor] = None,
    cu_seqlens_k: Optional[torch.Tensor] = None,
    max_seq_len_q: Optional[int] = None,
    max_seq_len_k: Optional[int] = None,
    softmax_scale: Optional[float] = None,
    causal: bool = False,
    seqlen_kv: Optional[torch.Tensor] = None,
    page_table: Optional[torch.Tensor] = None,
    seqlen_k: Optional[int] = None,
    window_size_left: int = -1,
    window_size_right: int = -1,
    bottom_right: bool = True,
):
    if q.dtype == torch.float16:
        out_dtype = torch.float16
    elif q.dtype == torch.bfloat16:
        out_dtype = torch.bfloat16
    elif q.dtype == torch.float8_e4m3fn:
        # Output is BF16 when input is FP8
        out_dtype = torch.bfloat16
    else:
        raise RuntimeError(f"Unsupported dtype for q: {q.dtype}")

    kIsVarlen = max_seq_len_q is not None
    if kIsVarlen:
        assert cu_seqlens_q is not None
        SQ = q.shape[0]
        H_Q = q.shape[1]
        B = cu_seqlens_q.shape[0] - 1
    else:
        SQ = q.shape[1]
        H_Q = q.shape[2]
        B = q.shape[0]
    device = q.device
    options2 = {"dtype": torch.float32, "device": device}
    if kIsVarlen:
        assert max_seq_len_q is not None
        out = torch.empty_like(q, dtype=out_dtype)
        size = out.size()
        stride = out.stride()
        storage_offset = q.shape[-1] * max_seq_len_q * H_Q  # example scalar offset
        out1 = torch.as_strided(
            out, size=size, stride=stride, storage_offset=storage_offset
        )
    else:
        out1 = torch.empty_like(q, dtype=out_dtype)

    if kIsVarlen:
        out2 = torch.empty((1, H_Q, SQ), **options2)  # type: ignore
    else:
        out2 = torch.empty((B, H_Q, SQ), **options2)  # type: ignore
    return out1, out2
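
The register_fake implementation above is what lets torch.compile, torch.export, and FakeTensorMode trace the op without launching the CUDA kernel: it only has to reproduce output shapes and dtypes. Below is a minimal sketch of exercising it; this is not part of the packaged file, and the tensor shapes, the fixed-length [B, S, H, D] layout, and a CUDA-enabled PyTorch build are assumptions.

    import torch
    from torch._subclasses.fake_tensor import FakeTensorMode

    # Importing the module registers the op schema, CUDA impl, and fake kernel.
    import fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_custom_op  # noqa: F401

    with FakeTensorMode():
        # Assumed layout: [B=2, S=128, H=8, D=64]
        q = torch.empty(2, 128, 8, 64, dtype=torch.bfloat16, device="cuda")
        k = torch.empty(2, 128, 8, 64, dtype=torch.bfloat16, device="cuda")
        v = torch.empty(2, 128, 8, 64, dtype=torch.bfloat16, device="cuda")
        out, lse = torch.ops.blackwell_fmha.fmha_fwd(
            q, k, v,
            cu_seqlens_q=None, cu_seqlens_k=None,
            max_seq_len_q=None, max_seq_len_k=None,
            softmax_scale=None, causal=True,
            seqlen_kv=None, page_table=None,
        )
        # fmha_fwd_meta: out mirrors q, softmax_lse is (B, H_Q, SQ) in fp32.
        assert out.shape == (2, 128, 8, 64) and out.dtype == torch.bfloat16
        assert lse.shape == (2, 8, 128) and lse.dtype == torch.float32
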
@torch.library.impl("blackwell_fmha::fmha_bwd", "cuda")
def custom_op_fmha_bwd(
    dOutput: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    softmax_lse: torch.Tensor,
    cu_seqlens_q: Optional[torch.Tensor] = None,
    cu_seqlens_k: Optional[torch.Tensor] = None,
    max_seq_len_q: Optional[int] = None,
    max_seq_len_k: Optional[int] = None,
    softmax_scale: Optional[float] = None,
    causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    bottom_right: bool = True,
    deterministic: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

    return torch.ops.fbgemm.fmha_bwd(
        dOutput,
        query,
        key,
        value,
        output,
        softmax_lse,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seq_len_q=max_seq_len_q,
        max_seq_len_k=max_seq_len_k,
        softmax_scale=softmax_scale,
        causal=causal,
        window_size_left=window_size_left,
        window_size_right=window_size_right,
        bottom_right=bottom_right,
        deterministic=deterministic,
    )


@register_fake("blackwell_fmha::fmha_bwd")
def fmha_bwd_meta(
    dOutput: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    softmax_lse: torch.Tensor,
    cu_seqlens_q: Optional[torch.Tensor] = None,
    cu_seqlens_k: Optional[torch.Tensor] = None,
    max_seq_len_q: Optional[int] = None,
    max_seq_len_k: Optional[int] = None,
    softmax_scale: Optional[float] = None,
    causal: bool = False,
    window_size_left: int = -1,
    window_size_right: int = -1,
    bottom_right: bool = True,
    deterministic: bool = False,
):
    return (
        torch.empty_like(query),
        torch.empty_like(key),
        torch.empty_like(value),
    )


def _backward(ctx, *grad):
    if ctx.is_gen:
        # For gen case, no backward pass is needed (generation is inference only)
        raise RuntimeError("Backward pass is not supported for generation phase (sq=1)")
    q, k, v, out, softmax_lse = ctx.saved_tensors
    if not grad[0].is_contiguous():
        grad0 = grad[0].contiguous()
    else:
        grad0 = grad[0]
    if not softmax_lse.is_contiguous():
        softmax_lse = softmax_lse.contiguous()
    if not out.is_contiguous():
        out = out.contiguous()
    if not q.is_contiguous():
        q = q.contiguous()
    if not k.is_contiguous():
        k = k.contiguous()

    dq, dk, dv = torch.ops.blackwell_fmha.fmha_bwd(
        grad0,
        q,
        k,
        v,
        out,
        softmax_lse,
        ctx.cu_seqlens_q,
        ctx.cu_seqlens_k,
        ctx.max_seq_len_q,
        ctx.max_seq_len_k,
        ctx.softmax_scale,
        ctx.causal,
        ctx.window_size_left,
        ctx.window_size_right,
        ctx.bottom_right,
        ctx.deterministic,
    )
    # One gradient per forward input: dq, dk, dv, then None for the
    # twelve non-differentiable arguments.
    return (
        dq,
        dk,
        dv,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        None,
    )

def _setup_context(ctx, inputs, output):
    (
        q,
        k,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seq_len_q,
        max_seq_len_k,
        softmax_scale,
        causal,
        seqlen_kv,
        page_table,
        seqlen_k,
        window_size_left,
        window_size_right,
        bottom_right,
    ) = inputs
    (out, softmax_lse) = output
    ctx.save_for_backward(q, k, v, out, softmax_lse)
    ctx.softmax_scale = softmax_scale
    ctx.causal = causal
    ctx.max_seq_len_q = max_seq_len_q
    ctx.max_seq_len_k = max_seq_len_k
    ctx.cu_seqlens_q = cu_seqlens_q
    ctx.cu_seqlens_k = cu_seqlens_k
    ctx.window_size_left = window_size_left
    ctx.window_size_right = window_size_right
    ctx.bottom_right = bottom_right
    ctx.deterministic = False  # Set default value
    ctx.is_gen = False


# This code adds training support for the operator. You must provide us
# the backward formula for the operator and a `setup_context` function
# to save values to be used in the backward.
torch.library.register_autograd(
    "blackwell_fmha::fmha_fwd", _backward, setup_context=_setup_context
)
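
With the autograd registration above in place, gradients flow through the custom op like any other differentiable operation. A minimal training-style sketch follows; it is not part of the packaged file, and it assumes a supported Blackwell GPU, that the fbgemm_gpu_experimental_gen_ai.so extension backing torch.ops.fbgemm.fmha_fwd / fmha_bwd has been loaded, and the same fixed-length [B, S, H, D] layout as in the earlier example.

    import torch

    # Assumed layout: [B=2, S=128, H=8, D=64] on an SM100-class GPU.
    q = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    k = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    v = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.bfloat16, requires_grad=True)

    out, _lse = torch.ops.blackwell_fmha.fmha_fwd(
        q, k, v,
        cu_seqlens_q=None, cu_seqlens_k=None,
        max_seq_len_q=None, max_seq_len_k=None,
        softmax_scale=None, causal=True,
        seqlen_kv=None, page_table=None,
    )
    # backward() routes through _backward above, which calls blackwell_fmha::fmha_bwd.
    out.sum().backward()
    assert q.grad is not None and k.grad is not None and v.grad is not None
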
def cutlass_blackwell_fmha_custom_op(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    softmax_scale: float | None = None,
    causal: bool = False,
    cu_seqlens_q: torch.Tensor | None = None,
    cu_seqlens_k: torch.Tensor | None = None,
    max_seq_len_q: int | None = None,
    max_seq_len_k: int | None = None,
    seqlen_kv: torch.Tensor | None = None,
    page_table: torch.Tensor | None = None,
    seqlen_k: int | None = -1,
    window_size_left: int | None = -1,
    window_size_right: int | None = -1,
    bottom_right: bool | None = True,
):
    return torch.ops.blackwell_fmha.fmha_fwd(
        q=q,
        k=k,
        v=v,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seq_len_q=max_seq_len_q,
        max_seq_len_k=max_seq_len_k,
        softmax_scale=softmax_scale,
        causal=causal,
        seqlen_kv=seqlen_kv,
        page_table=page_table,
        seqlen_k=seqlen_k,
        window_size_left=window_size_left,
        window_size_right=window_size_right,
        bottom_right=bottom_right,
    )[0]
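
The wrapper above takes fixed-length [B, S, H, D] inputs by default; for ragged batches it is called with packed [total_tokens, H, D] tensors plus cumulative sequence-length offsets, matching the varlen branch of fmha_fwd_meta. A hedged usage sketch follows; it is not part of the packaged file, and the shapes, sequence lengths, offset dtype, and availability of a supported Blackwell GPU are assumptions.

    import torch
    from fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_custom_op import (
        cutlass_blackwell_fmha_custom_op,
    )

    # Two sequences of lengths 100 and 28 packed into one 128-token batch.
    seq_lens = torch.tensor([100, 28], device="cuda", dtype=torch.int32)
    cu_seqlens = torch.zeros(3, device="cuda", dtype=torch.int32)
    cu_seqlens[1:] = torch.cumsum(seq_lens, dim=0)

    total_tokens, n_heads, head_dim = 128, 8, 64
    q = torch.randn(total_tokens, n_heads, head_dim, device="cuda", dtype=torch.bfloat16)
    k = torch.randn(total_tokens, n_heads, head_dim, device="cuda", dtype=torch.bfloat16)
    v = torch.randn(total_tokens, n_heads, head_dim, device="cuda", dtype=torch.bfloat16)

    out = cutlass_blackwell_fmha_custom_op(
        q, k, v,
        causal=True,
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_k=cu_seqlens,
        max_seq_len_q=100,
        max_seq_len_k=100,
    )
    assert out.shape == q.shape
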