fbgemm-gpu-genai-nightly 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of fbgemm-gpu-genai-nightly might be problematic.

Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
@@ -0,0 +1,221 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import torch
+import triton
+import triton.language as tl
+
+from .common import expect_contiguous
+
+
+@triton.jit
+def jagged2_to_padded_dense_kernel(
+    x_ptr,
+    lengths_ptr,
+    offsets_ptr,
+    output_dense_ptr,
+    stride_b,
+    stride_m,
+    stride_n,
+    max_length,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_batch = tl.program_id(2)
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    begin = tl.load(offsets_ptr + pid_batch)
+    seqlen = tl.load(lengths_ptr + pid_batch)
+
+    seqlen = tl.minimum(seqlen, max_length)
+    if seqlen == 0:
+        return
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    x_ptrs = x_ptr + begin + offs_m[:, None] * seqlen + offs_n[None, :]
+    x = tl.load(x_ptrs, mask=((offs_m[:, None] < seqlen) & (offs_n[None, :] < seqlen)))
+
+    out_ptrs = (
+        output_dense_ptr
+        + pid_batch * stride_b
+        + offs_m[:, None] * stride_m
+        + offs_n[None, :] * stride_n
+    )
+    tl.store(
+        out_ptrs, x, mask=((offs_m[:, None] < seqlen) & (offs_n[None, :] < seqlen))
+    )
+
+
+@triton.jit
+def padded_dense_to_jagged2_kernel(
+    x_ptr,
+    lengths_ptr,
+    offsets_ptr,
+    output_jagged_ptr,
+    stride_b,
+    stride_m,
+    stride_n,
+    max_length,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_batch = tl.program_id(2)
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    begin = tl.load(offsets_ptr + pid_batch)
+    # end = tl.load(offsets_ptr + pid_batch + 1)
+    seqlen = tl.load(lengths_ptr + pid_batch)
+
+    seqlen = tl.minimum(seqlen, max_length)
+
+    if seqlen == 0:
+        return
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    x_ptrs = (
+        x_ptr
+        + pid_batch * stride_b
+        + offs_m[:, None] * stride_m
+        + offs_n[None, :] * stride_n
+    )
+    x = tl.load(x_ptrs, mask=((offs_m[:, None] < seqlen) & (offs_n[None, :] < seqlen)))
+    out_ptrs = output_jagged_ptr + begin + offs_m[:, None] * seqlen + offs_n[None, :]
+    tl.store(
+        out_ptrs, x, mask=((offs_m[:, None] < seqlen) & (offs_n[None, :] < seqlen))
+    )
+
+
+def jagged2_to_padded_dense_fwd(
+    values: torch.Tensor,
+    lengths: torch.Tensor,
+    offsets: torch.Tensor,
+    max_length: int,
+    padding_value: float,
+) -> torch.Tensor:
+    B = offsets.size(0) - 1
+
+    output_dense = torch.full(
+        (B, max_length, max_length),
+        padding_value,
+        dtype=values.dtype,
+        device=values.device,
+    )
+    BLOCK_M = 32
+    BLOCK_N = 32
+    num_blocks_m = triton.cdiv(max_length, BLOCK_M)
+    num_blocks_n = triton.cdiv(max_length, BLOCK_N)
+    grid = (num_blocks_m, num_blocks_n, B)
+
+    jagged2_to_padded_dense_kernel[grid](
+        values,
+        lengths,
+        offsets,
+        output_dense,
+        output_dense.stride(0),
+        output_dense.stride(1),
+        output_dense.stride(2),
+        max_length,
+        # pyre-fixme[6]: Incompatible parameter type [6]: expected `constexpr` but got `int`.
+        BLOCK_M,
+        # pyre-fixme[6]: Incompatible parameter type [6]: expected `constexpr` but got `int`.
+        BLOCK_N,
+    )
+
+    return output_dense
+
+
+def padded_dense_to_jagged2_fwd(
+    values: torch.Tensor,
+    lengths: torch.Tensor,
+    offsets: torch.Tensor,
+    max_length: int,
+) -> torch.Tensor:
+    B = values.size(0)
+    output_jagged = torch.empty(
+        int(offsets[-1]), dtype=values.dtype, device=values.device
+    )
+    BLOCK_M = 32
+    BLOCK_N = 32
+    num_blocks_m = triton.cdiv(max_length, BLOCK_M)
+    num_blocks_n = triton.cdiv(max_length, BLOCK_N)
+    grid = (num_blocks_m, num_blocks_n, B)
+
+    padded_dense_to_jagged2_kernel[grid](
+        values,
+        lengths,
+        offsets,
+        output_jagged,
+        values.stride(0),
+        values.stride(1),
+        values.stride(2),
+        max_length,
+        # pyre-fixme[6]: Incompatible parameter type [6]: expected `constexpr` but got `int`.
+        BLOCK_M,
+        # pyre-fixme[6]: Incompatible parameter type [6]: expected `constexpr` but got `int`.
+        BLOCK_N,
+    )
+
+    return output_jagged
+
+
+class Jagged2ToPaddedDense(torch.autograd.Function):
+    @staticmethod
+    # pyre-fixme
+    def forward(
+        ctx,
+        values: torch.Tensor,
+        offsets: torch.Tensor,
+        max_length: int,
+        padding_value: float,
+    ) -> torch.Tensor:
+        lengths_square = offsets[1:] - offsets[0:-1:1]
+        lengths = torch.sqrt(lengths_square).to(torch.int32)
+
+        ctx.max_length = max_length
+        ctx.save_for_backward(lengths, offsets)
+
+        output = jagged2_to_padded_dense_fwd(
+            values, lengths, offsets, max_length, padding_value
+        )
+        return output
+
+    @staticmethod
+    # pyre-fixme
+    def backward(
+        ctx, grad_output: torch.Tensor
+    ) -> tuple[torch.Tensor, None, None, None]:
+        max_length = ctx.max_length
+        (lengths, offsets) = ctx.saved_tensors
+        grad_in = padded_dense_to_jagged2_fwd(grad_output, lengths, offsets, max_length)
+        return (grad_in, None, None, None)
+
+
+def jagged2_to_padded_dense(
+    values: torch.Tensor,
+    offsets: torch.Tensor,
+    max_length: int,
+    padding_value: float = 0.0,
+) -> torch.Tensor:
+    """
+    values: jagged tensor with size [sum(Ni * Ni)]
+    offsets: offsets for jagged tensor, with size [B + 1]
+    max_length: maximum sequence length in the batch
+    padding_value: value to use for padding
+    return padded dense tensor of size [B, N, N]
+    """
+    values = expect_contiguous(values)
+    offsets = expect_contiguous(offsets)
+
+    return Jagged2ToPaddedDense.apply(values, offsets, max_length, padding_value)
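
For context (not part of the published diff): a minimal usage sketch of jagged2_to_padded_dense from the file above. The import path is assumed from the file listing (item 59) and may differ from how the package re-exports the symbol; the Triton kernels require a CUDA device, and the values here are illustrative only.

import torch

# Assumed import path, based on the file list above (item 59).
from fbgemm_gpu.sll.triton.triton_jagged2_to_padded_dense import (
    jagged2_to_padded_dense,
)

# Two square jagged blocks of sizes 2x2 and 3x3, flattened into one values tensor.
Ni = [2, 3]
values = torch.randn(sum(n * n for n in Ni), device="cuda")  # [sum(Ni * Ni)] = [13]
offsets = torch.tensor([0, 4, 13], device="cuda")            # [B + 1], cumulative Ni * Ni

# Pad each Ni x Ni block into a max_length x max_length slice of the output.
padded = jagged2_to_padded_dense(values, offsets, max_length=3, padding_value=0.0)
print(padded.shape)  # torch.Size([2, 3, 3])

Note that Jagged2ToPaddedDense.forward recovers the per-example lengths as the square root of each offsets span, so the offsets must describe exactly square Ni x Ni blocks.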
@@ -0,0 +1,418 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import torch
+import triton
+import triton.language as tl
+
+
+def set_block_size(N: int) -> int:
+    if N > 64:
+        return 64
+    elif N > 16:
+        return 32
+    else:
+        return 16
+
+
+# TODO add autotune to find best block size
+# add supergroup to optimize GPU cache
+@triton.jit
+def jagged_dense_bmm_kernel(
+    a_ptr,
+    a_offset_ptr,
+    b_ptr,
+    c_ptr,
+    N,
+    K,
+    stride_am,
+    stride_ak,
+    stride_bl,  # batch idx
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    max_seq_len,  # max sequence length for jagged tensor
+    allow_tf32: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    """Kernel for computing the matmul C = A x B.
+    A has shape (sum_B(M_i), K), B has shape (B, K, N) and C has shape (sum_B(M_i), N)
+    """
+    pid_batch = tl.program_id(0)
+    pid = tl.program_id(1)
+
+    # a_offset_ptr has stride of 1
+    # row_start for jagged tensor
+    begin = tl.load(a_offset_ptr + pid_batch)
+    end = tl.load(a_offset_ptr + pid_batch + 1)
+    M = tl.minimum(end - begin, max_seq_len)  # in case M > max seq len
+    if M == 0:
+        return
+
+    # num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+
+    # if pid_m * BLOCK_SIZE_M >= M, then this block doesn't need to be computed
+    if pid_m * BLOCK_SIZE_M >= M:
+        return
+
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+
+    if pid_n * BLOCK_SIZE_N >= N:
+        return
+
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (
+        offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + begin * stride_am
+    )  # jagged tensor ptr
+    b_ptrs = b_ptr + (
+        offs_k[:, None] * stride_bk
+        + offs_bn[None, :] * stride_bn
+        + pid_batch * stride_bl
+    )  # dense tensor ptr
+
+    c = tl.zeros(
+        (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
+    )  # TODO, make this flexible
+
+    # Compute c[m, n] for 1 example of the batch
+    for k in range(0, K, BLOCK_SIZE_K):
+        updated_offset = k + offs_k
+        a = tl.load(
+            a_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=(updated_offset[None, :] < K) & (offs_am[:, None] < M),
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=(updated_offset[:, None] < K) & (offs_bn[None, :] < N),
+            other=0.0,
+        )
+        c += tl.dot(a, b, allow_tf32=allow_tf32)
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
+    c_ptrs = (
+        c_ptr
+        + stride_cm * offs_m[:, None]
+        + stride_cn * offs_n[None, :]
+        + begin * stride_cm
+    )
+    tl.store(c_ptrs, c, mask=mask)
+
+
+@triton.jit
+def jagged_jagged_bmm_kernel(
+    a_ptr,
+    a_offset_ptr,
+    b_ptr,
+    c_ptr,
+    M,
+    N,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cl,
+    stride_cm,
+    stride_cn,
+    max_seq_len,
+    allow_tf32: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    """
+    Kernel for computing the matmul C = A x B.
+    A has shape (M, sum_B(Ki)), B has shape (sum_B(Ki), N) and C has shape (B, M, N)
+    """
+    pid_batch = tl.program_id(0)
+    pid = tl.program_id(1)
+
+    # need to make sure a_offset_ptr has stride of 1
+    begin = tl.load(a_offset_ptr + pid_batch)
+    end = tl.load(a_offset_ptr + pid_batch + 1)
+    K = end - begin  # K for current pid_batch
+    K = tl.minimum(K, max_seq_len)
+    # if K == 0:
+    #     return
+
+    # calculate pid_m and pid_n
+    # num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = (
+        a_ptr
+        + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+        + begin * stride_ak
+    )
+    b_ptrs = (
+        b_ptr
+        + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+        + begin * stride_bk
+    )
+
+    c = tl.zeros(
+        (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
+    )  # TODO, make this flexible
+    for k in range(0, K, BLOCK_SIZE_K):
+        updated_offset = k + offs_k
+        a = tl.load(
+            a_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=((updated_offset[None, :] < K) & (offs_am[:, None] < M)),
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=((updated_offset[:, None] < K) & (offs_bn[None, :] < N)),
+            other=0.0,
+        )
+        c += tl.dot(a, b, allow_tf32=allow_tf32)
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
+    c_ptrs = (
+        c_ptr
+        + stride_cm * offs_m[:, None]
+        + stride_cn * offs_n[None, :]
+        + stride_cl * pid_batch
+    )
+
+    tl.store(c_ptrs, c, mask=mask)
+
+
+def triton_jagged_dense_bmm(a, b, a_offsets, max_seq_len, allow_tf32):
+    # check constraints
+    assert a.shape[1] == b.shape[1], "incompatible dimensions"
+    assert a_offsets.is_contiguous(), "A offsets must be contiguous"
+    sum_B, K = a.shape
+    B, K, N = b.shape
+    # Use zeros instead of empty to handle the corner case when the jagged tensor has length > max seq len
+    # In that case, it is possible that the output is inconsistent with the padded version if empty is used
+    c = a.new_zeros((sum_B, N))
+
+    BLOCK_SIZE_M = 32 if max_seq_len < 50 else 64
+    BLOCK_SIZE_N = set_block_size(N)
+    BLOCK_SIZE_K = set_block_size(K)
+
+    # 2D launch kernel where each block gets its own program.
+    # TODO, is this the best way to handle the launch grid?
+    # The grid size on the M axis is often larger than required due to max_seq_len
+    grid = (
+        B,
+        triton.cdiv(max_seq_len, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N),
+    )
+
+    jagged_dense_bmm_kernel[grid](
+        a,
+        a_offsets,
+        b,
+        c,
+        N,
+        K,
+        a.stride(0),
+        a.stride(1),
+        b.stride(0),
+        b.stride(1),
+        b.stride(2),
+        c.stride(0),
+        c.stride(1),
+        max_seq_len,
+        allow_tf32,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+        BLOCK_SIZE_K,
+    )
+    return c
+
+
+def triton_jagged_jagged_bmm(a, b, a_offsets, max_seq_len, allow_tf32):
+    # check constraints
+    assert a.shape[1] == b.shape[0], "incompatible dimensions"
+    assert a_offsets.is_contiguous(), "A offsets must be contiguous"
+    M, _ = a.shape
+    _, N = b.shape
+    B = a_offsets.size(0) - 1
+    # allocate output
+    c = torch.empty((B, M, N), device=a.device, dtype=a.dtype)
+    # 2D launch kernel where each block gets its own program.
+    BLOCK_SIZE_M = set_block_size(M)
+    BLOCK_SIZE_N = set_block_size(N)
+    BLOCK_SIZE_K = 32
+    grid = (
+        B,
+        triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N),
+    )
+    jagged_jagged_bmm_kernel[grid](
+        a,
+        a_offsets,
+        b,
+        c,
+        M,
+        N,
+        a.stride(0),
+        a.stride(1),
+        b.stride(0),
+        b.stride(1),
+        c.stride(0),
+        c.stride(1),
+        c.stride(2),
+        max_seq_len,
+        allow_tf32,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+        BLOCK_SIZE_K,
+    )
+    return c
+
+
+class JaggedDenseBmm(torch.autograd.Function):
+    """
+    Compute batch matrix multiplication between JaggedTensor and dense tensor
+    dense: [B, N, D] * [B, D, T] = [B, N, T]
+    jagged: [Sum_B, D] * [B, D, T] = [Sum_B, T]
+    """
+
+    @staticmethod
+    # pyre-fixme
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        y: torch.Tensor,
+        x_offsets: torch.Tensor,
+        N: int,
+        allow_tf32: bool,
+    ):
+        ctx.save_for_backward(x, y, x_offsets)
+        ctx.N = N
+        ctx.allow_tf32 = allow_tf32
+        return triton_jagged_dense_bmm(x, y, x_offsets, N, allow_tf32=allow_tf32)
+
+    @staticmethod
+    # pyre-fixme
+    def backward(ctx, grad_output: torch.Tensor):
+        """
+        # X = [Sum_B, D]
+        # Y = [B, D, T]
+        # Z = X * Y = [Sum_B, T]
+        # dX = dZ * Y^T  # [Sum_B, T] * [B, T, D] = [Sum_B, D]
+        # dY = X^T * dZ  # [D, Sum_B] * [Sum_B, T] = [B, D, T]
+        """
+
+        # logging.info(f"Jagged bmm backward called")
+
+        (x, y, x_offsets) = ctx.saved_tensors
+        N = ctx.N
+        grad_x = triton_jagged_dense_bmm(
+            grad_output, y.permute(0, 2, 1), x_offsets, N, allow_tf32=ctx.allow_tf32
+        )
+        grad_y = triton_jagged_jagged_bmm(
+            x.T, grad_output, x_offsets, N, allow_tf32=ctx.allow_tf32
+        )
+        return grad_x, grad_y, None, None, None
+
+
+class JaggedJaggedBmm(torch.autograd.Function):
+    """
+    Compute batch matrix multiplication between JaggedTensor and Jagged Tensor
+    dense: [B, D, N] * [B, N, T] = [B, D, T]
+    jagged: [Sum_B, D].T * [Sum_B, T] = [B, D, T]
+    """
+
+    @staticmethod
+    # pyre-fixme
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        y: torch.Tensor,
+        x_offsets: torch.Tensor,
+        N: int,
+        allow_tf32,
+    ):
+        ctx.save_for_backward(x, y, x_offsets)
+        ctx.N = N
+        ctx.allow_tf32 = allow_tf32
+        return triton_jagged_jagged_bmm(x.T, y, x_offsets, N, allow_tf32=allow_tf32)
+
+    @staticmethod
+    # pyre-fixme
+    def backward(ctx, grad_output: torch.Tensor):
+        """
+        # X = [Sum_B, D]
+        # Y = [Sum_B, T]
+        # Z = X^T * Y = [B, D, T]
+        # dX^T = dZ * Y^T -> dX = Y * dZ^T
+        # dY = X * dZ
+        """
+        (x, y, offsets) = ctx.saved_tensors
+        N = ctx.N
+        grad_x = triton_jagged_dense_bmm(
+            y, grad_output.permute(0, 2, 1), offsets, N, allow_tf32=ctx.allow_tf32
+        )
+        grad_y = triton_jagged_dense_bmm(
+            x, grad_output, offsets, N, allow_tf32=ctx.allow_tf32
+        )
+        return grad_x, grad_y, None, None, None
+
+
+def jagged_dense_bmm(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    x_offsets: torch.Tensor,
+    N: int,
+    allow_tf32: bool,
+    use_fbgemm_kernel: bool = True,
+) -> torch.Tensor:
+    """
+    Compute batch matrix multiplication between JaggedTensor and dense tensor
+    dense: [B, N, D] * [B, D, T] = [B, N, T]
+    jagged: [Sum_B, D] * [B, D, T] = [Sum_B, T]
+    """
+    if use_fbgemm_kernel:
+        return torch.ops.fbgemm.jagged_dense_bmm(x, x_offsets, y, N)[0]
+    else:
+        return JaggedDenseBmm.apply(x, y, x_offsets, N, allow_tf32)
+
+
+def jagged_jagged_bmm(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    x_offsets: torch.Tensor,
+    N: int,
+    allow_tf32: bool,
+    use_fbgemm_kernel: bool = True,
+):
+    """
+    Compute batch matrix multiplication between JaggedTensor and Jagged Tensor
+    dense: [B, D, N] * [B, N, T] = [B, D, T]
+    jagged: [Sum_B, D].T * [Sum_B, T] = [B, D, T]
+    """
+    if use_fbgemm_kernel:
+        return torch.ops.fbgemm.jagged_jagged_bmm(x, y, x_offsets, N)
+    else:
+        return JaggedJaggedBmm.apply(x, y, x_offsets, N, allow_tf32)
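
For context (not part of the published diff): a minimal sketch of the two wrapper functions at the end of this file, using the Triton path (use_fbgemm_kernel=False) so the autograd Functions above are exercised. The import path is assumed from the file listing (item 60), the N argument plays the role of the maximum sequence length, and a CUDA device is required.

import torch

# Assumed import path, based on the file list above (item 60).
from fbgemm_gpu.sll.triton.triton_jagged_bmm import jagged_dense_bmm, jagged_jagged_bmm

B, D, T, max_seq_len = 2, 16, 8, 4
lengths = torch.tensor([3, 4], device="cuda")
x_offsets = torch.cat(
    [torch.zeros(1, dtype=torch.int64, device="cuda"), lengths.cumsum(0)]
)                                                      # [0, 3, 7]
x = torch.randn(int(x_offsets[-1]), D, device="cuda")  # jagged values, [Sum_B, D]
y = torch.randn(B, D, T, device="cuda")                # dense, [B, D, T]

# Jagged x dense: [Sum_B, D] * [B, D, T] -> [Sum_B, T]
z = jagged_dense_bmm(x, y, x_offsets, max_seq_len, allow_tf32=True,
                     use_fbgemm_kernel=False)
print(z.shape)  # torch.Size([7, 8])

# Jagged x jagged: [Sum_B, D].T * [Sum_B, T] -> [B, D, T]
w = jagged_jagged_bmm(x, z, x_offsets, max_seq_len, allow_tf32=True,
                      use_fbgemm_kernel=False)
print(w.shape)  # torch.Size([2, 16, 8])

With the default use_fbgemm_kernel=True, both wrappers instead dispatch to the compiled torch.ops.fbgemm operators shipped in this wheel rather than the Triton kernels defined above.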