fbgemm-gpu-nightly-cpu 2025.3.27__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. fbgemm_gpu/__init__.py +118 -23
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +142 -1
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +244 -76
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +26 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +208 -105
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +261 -53
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +9 -58
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +10 -59
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +225 -41
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +211 -36
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +195 -26
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +225 -41
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +225 -41
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +216 -111
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +221 -37
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +259 -53
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +192 -96
  58. fbgemm_gpu/split_embedding_configs.py +287 -3
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py +2 -0
  61. fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py +2 -0
  62. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +275 -9
  63. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +44 -37
  64. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +900 -126
  65. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  66. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  67. fbgemm_gpu/tbe/bench/__init__.py +13 -2
  68. fbgemm_gpu/tbe/bench/bench_config.py +37 -9
  69. fbgemm_gpu/tbe/bench/bench_runs.py +301 -12
  70. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +189 -0
  71. fbgemm_gpu/tbe/bench/eeg_cli.py +138 -0
  72. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +4 -5
  73. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  74. fbgemm_gpu/tbe/bench/tbe_data_config.py +116 -198
  75. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  76. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +158 -32
  77. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +16 -8
  78. fbgemm_gpu/tbe/bench/utils.py +129 -5
  79. fbgemm_gpu/tbe/cache/__init__.py +1 -0
  80. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  81. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -5
  82. fbgemm_gpu/tbe/ssd/common.py +27 -0
  83. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  84. fbgemm_gpu/tbe/ssd/training.py +2930 -195
  85. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +34 -3
  86. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  87. fbgemm_gpu/tbe/stats/bench_params_reporter.py +349 -0
  88. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  89. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  90. fbgemm_gpu/tbe/utils/requests.py +53 -28
  91. fbgemm_gpu/tbe_input_multiplexer.py +16 -7
  92. fbgemm_gpu/triton/common.py +0 -1
  93. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  94. fbgemm_gpu/triton/quantize.py +14 -9
  95. fbgemm_gpu/utils/filestore.py +56 -5
  96. fbgemm_gpu/utils/torch_library.py +2 -2
  97. fbgemm_gpu/utils/writeback_util.py +124 -0
  98. fbgemm_gpu/uvm.py +3 -0
  99. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +3 -6
  100. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  101. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  102. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -3
  103. list_versions/cli_run.py +161 -0
  104. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/RECORD +0 -126
  105. fbgemm_gpu_nightly_cpu-2025.3.27.dist-info/top_level.txt +0 -1
  106. {fbgemm_gpu_nightly_cpu-2025.3.27.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0

fbgemm_gpu/quantize_comm.py
@@ -13,10 +13,11 @@


 import logging
-from typing import List, Optional, Tuple, TypeVar
+from typing import Optional, TypeVar

 import torch

+# fmt:skip
 from fbgemm_gpu.quantize_utils import (
     bf16_to_fp32,
     fp16_to_fp32,
@@ -25,12 +26,10 @@ from fbgemm_gpu.quantize_utils import (
     fp32_to_hfp8_with_clamp,
     fp32_to_mx4,
     hfp8_to_fp32,
-    mx4_to_fp32,
+    mx4_to_float,
     RoundingMode,
 )
-
 from fbgemm_gpu.split_embedding_configs import SparseType
-
 from torch.autograd.profiler import record_function  # usort:skip
 from dataclasses import dataclass

@@ -66,8 +65,8 @@ class QuantizationContext:
     row_dim: int = ROW_DIM_DEFAULT
     row_dim_quant: int = -1
     mx_group_size: int = MX_GROUP_SIZE_DEFAULT
-    rounding_mode: RoundingMode = RoundingMode.even
-    padded_dim_sum_per_rank: Optional[List[int]] = None
+    rounding_mode: Optional[RoundingMode] = RoundingMode.even
+    padded_dim_sum_per_rank: Optional[list[int]] = None


 def _quantize_tensor(
@@ -123,6 +122,7 @@ def _dequantize_tensor(
     comm_precision: SparseType,
     ctx: Optional[QuantizationContext] = None,
     is_fwd: bool = True,
+    output_dtype: Optional[SparseType] = None,
 ) -> torch.Tensor:
     if comm_precision == SparseType.FP32:
         assert quantized_tensor.dtype == torch.float
@@ -137,8 +137,12 @@ def _dequantize_tensor(
         if ctx is not None and ctx.row_dim > 0:
             row_dim_quant = ctx.row_dim_quant
             quantized_tensor_2d = quantized_tensor.view((-1, row_dim_quant))
+            # use provided output_dtype or default to FP32 (0)
+            output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
             dequant_tensor = torch.ops.fbgemm.FP8RowwiseQuantizedToFloat(
-                quantized_tensor_2d, is_fwd
+                quantized_tensor_2d,
+                is_fwd,
+                output_dtype_int,
             )
             return dequant_tensor.view(-1)
         else:
@@ -154,7 +158,7 @@ def _dequantize_tensor(
         return dequant_tensor.view(-1)
     elif comm_precision == SparseType.MX4:
         mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT
-        return mx4_to_fp32(quantized_tensor, mx_group_size)
+        return mx4_to_float(quantized_tensor, mx_group_size, output_dtype=output_dtype)
     else:
         raise ValueError(f"comm_precision={comm_precision} is not supported")

@@ -167,6 +171,8 @@ class QuantizedCommCodec:
         loss_scale: Optional[float] = None,
         row_dim: Optional[int] = None,
         is_fwd: bool = True,
+        rounding_mode: Optional[RoundingMode] = None,
+        output_dtype: Optional[SparseType] = None,
     ) -> None:
         if loss_scale is not None:
             if comm_precision not in [SparseType.FP16, SparseType.BF16]:
@@ -183,8 +189,13 @@ class QuantizedCommCodec:
         self._loss_scale = loss_scale
         self._is_fwd = is_fwd
         self._row_dim: int = -1 if row_dim is None else row_dim
+        self._rounding_mode: Optional[RoundingMode] = rounding_mode
+        self._output_dtype: Optional[SparseType] = output_dtype
         if self._comm_precision == SparseType.MX4:
             self._row_dim = MX_GROUP_SIZE_DEFAULT if row_dim is None else row_dim
+            self._rounding_mode = (
+                RoundingMode.even if rounding_mode is None else rounding_mode
+            )

     def encode(
         self, input_tensor: torch.Tensor, ctx: Optional[QuantizationContext] = None
@@ -211,7 +222,11 @@ class QuantizedCommCodec:
             f"## decoder {self._comm_precision} {self._loss_scale} ##"
         ):
             dequantized_tensor = _dequantize_tensor(
-                input_tensor, self._comm_precision, ctx, self._is_fwd
+                input_tensor,
+                self._comm_precision,
+                ctx,
+                self._is_fwd,
+                output_dtype=self._output_dtype,
             )
         return dequantized_tensor

@@ -258,7 +273,9 @@ class QuantizedCommCodec:
             return QuantizationContext(self._row_dim)
         if self._comm_precision == SparseType.MX4:
             return QuantizationContext(
-                row_dim=self._row_dim, mx_group_size=self._row_dim
+                row_dim=self._row_dim,
+                mx_group_size=self._row_dim,
+                rounding_mode=self._rounding_mode,
             )
         # int8 rowwise is default
         return QuantizationContext()
@@ -266,10 +283,10 @@ class QuantizedCommCodec:
     def padded_size(
         self,
         input_tensor: torch.Tensor,
-        dim_per_rank: List[int],
+        dim_per_rank: list[int],
         my_rank: int,
         qcomm_ctx: QuantizationContext,
-    ) -> Tuple[int, int]:
+    ) -> tuple[int, int]:
         if input_tensor.ndim == 1:
             return input_tensor.shape[0], 0
         # return padded size for the feature dimension (dim 1), 0 if no padding needed.
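
The hunks above give QuantizedCommCodec two new constructor knobs: rounding_mode, which is threaded into the MX4 QuantizationContext, and output_dtype, which is forwarded to _dequantize_tensor so decode() can return BF16 instead of FP32. The sketch below is illustrative only; it assumes the constructor's first positional argument is the comm precision and that the @@ -258 hunk sits inside a create_context() method, neither of which is visible in the diff itself.

# Illustrative sketch of the new QuantizedCommCodec knobs (assumptions noted above).
import torch

from fbgemm_gpu.quantize_comm import QuantizedCommCodec
from fbgemm_gpu.quantize_utils import RoundingMode
from fbgemm_gpu.split_embedding_configs import SparseType

codec = QuantizedCommCodec(
    SparseType.MX4,                   # assumed first positional arg: comm precision
    rounding_mode=RoundingMode.even,  # new; defaults to even for MX4 when omitted
    output_dtype=SparseType.BF16,     # new; decode() dequantizes to BF16 instead of FP32
)

grads = torch.randn(4, 128).flatten()  # MX4 packs rows in group_size chunks
ctx = codec.create_context()           # assumed name; carries row_dim, mx_group_size, rounding_mode
packed = codec.encode(grads, ctx)
restored = codec.decode(packed, ctx)   # dtype follows output_dtype (BF16 here)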

fbgemm_gpu/quantize_utils.py
@@ -10,11 +10,34 @@
 import logging
 from typing import Optional, Union

-import torch
+import torch  # isort:skip

-from fbgemm_gpu.triton import dequantize_mx4, quantize_mx4, RoundingMode
+import fbgemm_gpu
+from fbgemm_gpu.split_embedding_configs import SparseType
+from fbgemm_gpu.triton.common import RoundingMode
 from fbgemm_gpu.triton.quantize_ref import py_dequantize_mx4, py_quantize_mx4

+try:
+    if torch.cuda.is_available():
+        from fbgemm_gpu.triton import quantize_mx4
+        from fbgemm_gpu.triton.quantize import triton_dequantize_mx4
+except Exception:
+    pass
+
+
+try:
+    # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+    open_source = bool(getattr(fbgemm_gpu, "open_source", False))
+except NotImplementedError:
+    open_source = False
+
+# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+if not open_source:
+    from mtia.kernels.triton.mx4.quantize import (
+        triton_dequantize_mx4 as mtia_dequantize_mx4,
+        triton_quantize_mx4 as mtia_quantize_mx4,
+    )
+
 logger: logging.Logger = logging.getLogger()

 try:
@@ -60,7 +83,7 @@ def fp32_to_mx4(
     if rounding_mode is None:
         rounding_mode = RoundingMode.even

-    if not tensor.is_cuda:
+    if not tensor.is_cuda and not tensor.is_mtia:
         return py_quantize_mx4(
             tensor,
             group_size,
@@ -71,6 +94,15 @@
         )

     if use_triton:
+        if tensor.is_mtia:
+            return mtia_quantize_mx4(
+                tensor,
+                group_size,
+                ebits=ebits,
+                mbits=mbits,
+                rounding_mode=rounding_mode,
+                stochastic_casting=stochastic_casting,
+            )
         return quantize_mx4(
             tensor,
             group_size,
@@ -102,23 +134,71 @@ def mx4_to_fp32(
 ) -> torch.Tensor:
     """Dequantize an MX4 tensor to FP32 with triton or native cuda impl.

+    This function is kept for backward compatibility and always returns FP32.
+    For BF16 output, use mx4_to_float() with output_dtype=SparseType.BF16.
+    """
+    return mx4_to_float(
+        tensor,
+        group_size,
+        use_triton,
+        ebits,
+        mbits,
+        output_dtype=None,  # None = FP32 default for backward compatibility
+    )
+
+
+def mx4_to_float(
+    tensor: torch.Tensor,
+    group_size: int = 32,
+    use_triton: bool = True,
+    ebits: int = 2,
+    mbits: int = 1,
+    output_dtype: Optional[SparseType] = None,
+) -> torch.Tensor:
+    """Dequantize an MX4 tensor to FP32 or BF16 with triton or native cuda impl.
+
     Args:
         tensor (torch.Tensor): MX4 packed tensor with total elements (M / 2 + M / groupsize)
         group_size (int): Compute scale in chunks of group_size.
         use_triton (bool): If set, use triton quantization, otherwise cuda.
         ebits (int): Number of exponent bits in target mx4 format.
         mbits (int): Number of mantissa bits in target mx4 format.
+        output_dtype (Optional[SparseType]): Output dtype (FP32 or BF16).
+            Defaults to None (FP32) for backward compatibility.

     Return:
-        output: FP32 tensor with total elements (M).
+        output: Tensor with dtype matching output_dtype and total elements (M).
     """
+    # Validate output_dtype
+    supported_dtypes = {SparseType.FP32, SparseType.BF16}
+    if output_dtype is not None and output_dtype not in supported_dtypes:
+        raise ValueError(
+            f"output_dtype must be one of {supported_dtypes}, got {output_dtype}. "
+            f"FP16 is not supported due to potential overflow/underflow with MX4's wide exponent range. "
+            f"Use BF16 for memory savings with same dynamic range as FP32."
+        )
+
+    target_dtype = (
+        output_dtype.as_dtype() if output_dtype is not None else torch.float32
+    )
+
     # Accelerated MX4 dequantize is only available on cuda, if input is on cpu, use python.
-    if not tensor.is_cuda:
-        return py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+    if not tensor.is_cuda and not tensor.is_mtia:
+        result = py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+        return result.to(target_dtype) if output_dtype is not None else result
     if use_triton:
-        return dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+        if tensor.is_mtia:
+            return mtia_dequantize_mx4(
+                tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+            )
+        return triton_dequantize_mx4(
+            tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+        )
     else:
-        return torch.ops.fbgemm.dequantize_mx_cuda(tensor.flatten(), group_size)
+        output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
+        return torch.ops.fbgemm.dequantize_mx_cuda(
+            tensor.flatten(), group_size, output_dtype_int
+        )


 def fp32_to_fp16_with_clamp(tensor: torch.Tensor) -> torch.Tensor:
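
mx4_to_float() is the new entry point shown above; mx4_to_fp32() now simply forwards to it with output_dtype=None. Below is a minimal round trip on CPU using only names visible in the hunks; the input sizing is illustrative.

# Round trip through the MX4 helpers; on CPU both calls fall back to the
# py_quantize_mx4 / py_dequantize_mx4 reference implementations.
import torch

from fbgemm_gpu.quantize_utils import fp32_to_mx4, mx4_to_float
from fbgemm_gpu.split_embedding_configs import SparseType

x = torch.randn(1024)                    # element count chosen as a multiple of group_size
packed = fp32_to_mx4(x, group_size=32)   # packed MX4 payload plus per-group scales

y_fp32 = mx4_to_float(packed, group_size=32)                                # default FP32
y_bf16 = mx4_to_float(packed, group_size=32, output_dtype=SparseType.BF16)  # new BF16 path
# Passing SparseType.FP16 raises ValueError: MX4's exponent range can overflow FP16.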

fbgemm_gpu/runtime_monitor.py
@@ -12,7 +12,7 @@ import logging
 from collections import deque
 from dataclasses import dataclass
 from types import TracebackType
-from typing import Callable, Deque, Optional, Tuple, Type, TypeVar
+from typing import Callable, Optional, TypeVar

 import torch

@@ -49,6 +49,7 @@ class TBEStatsReporter(abc.ABC):
         embedding_id: str = "",
         tbe_id: str = "",
         time_unit: str = "ms",
+        enable_tb_metrics: bool = False,
     ) -> None:
         """
         Report the duration of a timed event.
@@ -63,6 +64,7 @@ class TBEStatsReporter(abc.ABC):
         data_bytes: int,
         embedding_id: str = "",
         tbe_id: str = "",
+        enable_tb_metrics: bool = False,
     ) -> None:
         """
         Report the size of some data amount.
@@ -89,9 +91,10 @@ class StdLogStatsReporter(TBEStatsReporter):
         embedding_id: str = "",
         tbe_id: str = "",
         time_unit: str = "ms",
+        enable_tb_metrics: bool = False,
     ) -> None:
         logging.info(
-            f"[Batch #{iteration_step}][TBE:{tbe_id}][Table:{embedding_id}] The event {event_name} took {duration_ms} {time_unit}"
+            f"[Batch #{iteration_step}][TBE:{tbe_id}][Table:{embedding_id}] The event {event_name} took {duration_ms} {time_unit} with {enable_tb_metrics}"
         )

     def report_data_amount(
@@ -101,9 +104,10 @@ class StdLogStatsReporter(TBEStatsReporter):
         data_bytes: int,
         embedding_id: str = "",
         tbe_id: str = "",
+        enable_tb_metrics: bool = False,
     ) -> None:
         logging.info(
-            f"[Batch #{iteration_step}][TBE:{tbe_id}][Table:{embedding_id}] The event {event_name} used {data_bytes} bytes"
+            f"[Batch #{iteration_step}][TBE:{tbe_id}][Table:{embedding_id}] The event {event_name} used {data_bytes} bytes with {enable_tb_metrics}"
         )

     def __repr__(self) -> str:
@@ -167,7 +171,7 @@ class AsyncSeriesTimerRecordedContext:

     def __exit__(
         self,
-        exc_type: Optional[Type[BaseException]],
+        exc_type: Optional[type[BaseException]],
         exc_val: Optional[BaseException],
         exc_tb: Optional[TracebackType],
     ) -> None:
@@ -187,7 +191,7 @@ class AsyncSeriesTimer:
     """

     def __init__(self, report_functor: Callable[[T, float], None]) -> None:
-        self._events_queue: Deque[Tuple[torch.cuda.Event, torch.cuda.Event, T]] = (
+        self._events_queue: deque[tuple[torch.cuda.Event, torch.cuda.Event, T]] = (
            deque()
        )
        self._active_start_event: Optional[torch.cuda.Event] = None
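
Both reporter hooks gain an enable_tb_metrics flag with a False default, so existing subclasses keep working. The following sketch of a custom reporter against the abstract interface is illustrative: the leading parameters (iteration_step, event_name, duration_ms, data_bytes) and the should_report() hook are inferred from the log strings and the surrounding class, not shown verbatim in these hunks.

# Hypothetical in-memory reporter built on TBEStatsReporter (see caveats above).
from fbgemm_gpu.runtime_monitor import TBEStatsReporter


class InMemoryStatsReporter(TBEStatsReporter):
    def __init__(self) -> None:
        self.durations: list[tuple[str, float]] = []
        self.data_amounts: list[tuple[str, int]] = []

    def should_report(self, iteration_step: int) -> bool:
        return True  # assumed abstract hook; report every step

    def report_duration(
        self,
        iteration_step: int,
        event_name: str,
        duration_ms: float,
        embedding_id: str = "",
        tbe_id: str = "",
        time_unit: str = "ms",
        enable_tb_metrics: bool = False,  # new in this release
    ) -> None:
        self.durations.append((event_name, duration_ms))

    def report_data_amount(
        self,
        iteration_step: int,
        event_name: str,
        data_bytes: int,
        embedding_id: str = "",
        tbe_id: str = "",
        enable_tb_metrics: bool = False,  # new in this release
    ) -> None:
        self.data_amounts.append((event_name, data_bytes))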

fbgemm_gpu/sll/__init__.py
@@ -9,12 +9,14 @@

 import torch

+# fmt:skip
 from fbgemm_gpu.sll.cpu import op_registrations as sll_cpu_registrations
 from fbgemm_gpu.sll.meta import op_registrations as sll_meta_registrations
 from fbgemm_gpu.utils import TorchLibraryFragment

 lib = TorchLibraryFragment("fbgemm")

+# fmt:off
 lib.define(
     """sll_jagged_dense_bmm(
         Tensor x,
@@ -170,6 +172,7 @@ lib.define(
     ) -> Tensor
     """
 )
+# fmt:on

 # NOTE: here we register the op for AutogradCUDA/CPU and CUDA/CPU with the same
 # function however, this is not ideal because in the inference case, we don't

fbgemm_gpu/sll/cpu/cpu_sll.py
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 # pyre-strict
-from typing import Any, Tuple
+from typing import Any

 import torch

@@ -65,7 +65,7 @@ class JaggedDenseBmmCPU(torch.autograd.Function):
     # pyre-fixme
     def backward(
         ctx: Any, grad_output: torch.Tensor  # pyre-ignore
-    ) -> Tuple[torch.Tensor, torch.Tensor, None, None, None]:
+    ) -> tuple[torch.Tensor, torch.Tensor, None, None, None]:
         """
         # X = [Sum_B, D]
         # Y = [B, D, T]
@@ -73,7 +73,7 @@ class JaggedDenseBmmCPU(torch.autograd.Function):
         # dX = dZ * YT # [Sum_B, T] * [B, T, D] = [Sum_B, D]
         # dY = XT * dZ # [D, sum_B] * [sum_B, T] = [D, B, T]
         """
-        (x, y, x_offsets) = ctx.saved_tensors
+        x, y, x_offsets = ctx.saved_tensors
         N = ctx.N
         grad_x = cpu_jagged_dense_bmm_kernel(
             grad_output, y.permute(0, 2, 1), x_offsets, N
@@ -128,7 +128,7 @@ class JaggedJaggedBmm(torch.autograd.Function):
     # pyre-fixme
     def backward(
         ctx: Any, grad_output: torch.Tensor  # pyre-ignore
-    ) -> Tuple[torch.Tensor, torch.Tensor, None, None, None]:
+    ) -> tuple[torch.Tensor, torch.Tensor, None, None, None]:
         """
         # X = [Sum_B, D]
         # Y = [Sum_B, T]
@@ -136,7 +136,7 @@ class JaggedJaggedBmm(torch.autograd.Function):
         # dXT = dZ * YT -> dX = Y * dZT
         # dY = X * dZ -> X * dZ
         """
-        (x, y, offsets) = ctx.saved_tensors
+        x, y, offsets = ctx.saved_tensors
         N = ctx.N
         grad_x = cpu_jagged_dense_bmm_kernel(
             y, grad_output.permute(0, 2, 1), offsets, N
@@ -172,7 +172,7 @@ def cpu_dense_jagged_cat_jagged_out(
     b: torch.Tensor,
     b_offsets: torch.Tensor,
     max_seq_len: int,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     assert a.size(0) == b_offsets.size(0) - 1
     c = torch.empty(b.size(0) + a.size(0), dtype=a.dtype, device=a.device)
     c_offsets = b_offsets + torch.arange(
@@ -368,7 +368,7 @@ class JaggedSoftmaxCPU(torch.autograd.Function):
     # pyre-fixme
     def backward(
         ctx: Any, grad_output: torch.Tensor  # pyre-ignore
-    ) -> Tuple[torch.Tensor, None, None]:
+    ) -> tuple[torch.Tensor, None, None]:
         y, x_offsets = ctx.saved_tensors

         B = x_offsets.size(0) - 1
@@ -923,7 +923,7 @@ class JaggedDenseAddCPU(torch.autograd.Function):
     def backward(
         ctx,  # pyre-ignore
         grad_output: torch.Tensor,
-    ) -> Tuple[torch.Tensor, None, torch.Tensor, None]:
+    ) -> tuple[torch.Tensor, None, torch.Tensor, None]:
         (offsets,) = ctx.saved_tensors
         grad_dense = torch.ops.fbgemm.jagged_to_padded_dense(
             grad_output, [offsets], [ctx.max_seq_len]
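
The recurring Tuple/List/Deque/Type to tuple/list/deque/type rewrites in this file (and in the triton files below) switch the annotations to PEP 585 builtin generics, which need no typing import on Python 3.9+; these wheels target CPython 3.11, so the change is safe. A small illustrative example of the style:

# PEP 585 style: builtin generics in annotations, no `from typing import Tuple, List`.
def split_halves(xs: list[int]) -> tuple[list[int], list[int]]:
    mid = len(xs) // 2
    return xs[:mid], xs[mid:]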

fbgemm_gpu/sll/triton/__init__.py
@@ -10,19 +10,16 @@
 from fbgemm_gpu.sll.triton.triton_dense_jagged_cat_jagged_out import (
     dense_jagged_cat_jagged_out,
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged2_to_padded_dense import (  # noqa F401
     jagged2_to_padded_dense,
     Jagged2ToPaddedDense,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_bmm import (  # noqa F401
     jagged_dense_bmm,
     jagged_jagged_bmm,
     JaggedDenseBmm,  # noqa F401
     JaggedJaggedBmm,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_bmm_jagged_out import (  # noqa F401
     array_jagged_bmm_jagged_out,
     ArrayJaggedBmmNopadding,  # noqa F401
@@ -31,38 +28,31 @@ from fbgemm_gpu.sll.triton.triton_jagged_bmm_jagged_out import (  # noqa F401
     triton_array_jagged_bmm_jagged_out,  # noqa F401
     triton_jagged_jagged_bmm_jagged_out,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_dense_elementwise_add import (  # noqa F401
     jagged_dense_elementwise_add,
     JaggedDenseAdd,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_dense_elementwise_mul_jagged_out import (  # noqa F401
     jagged_dense_elementwise_mul_jagged_out,
     JaggedDenseElementwiseMul,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_dense_flash_attention import (  # noqa F401
     jagged_dense_flash_attention,
     JaggedDenseFlashAttention,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_flash_attention_basic import (  # noqa F401
     jagged_flash_attention_basic,
     JaggedFlashAttentionBasic,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_self_substraction_jagged_out import (
     triton_jagged_self_substraction_jagged_out,
 )
-
 from fbgemm_gpu.sll.triton.triton_jagged_softmax import (  # noqa F401
     jagged2_softmax,
     Jagged2Softmax,  # noqa F401
     jagged_softmax,
     JaggedSoftmax,  # noqa F401
 )
-
 from fbgemm_gpu.sll.triton.triton_multi_head_jagged_flash_attention import (  # noqa F401
     multi_head_jagged_flash_attention,
     MultiHeadJaggedFlashAttention,  # noqa F401

fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py
@@ -6,7 +6,6 @@

 # pyre-unsafe

-from typing import Tuple

 import torch
 import triton
@@ -196,9 +195,9 @@ class Jagged2ToPaddedDense(torch.autograd.Function):
     # pyre-fixme
     def backward(
         ctx, grad_output: torch.Tensor
-    ) -> Tuple[torch.Tensor, None, None, None]:
+    ) -> tuple[torch.Tensor, None, None, None]:
         max_length = ctx.max_length
-        (lengths, offsets) = ctx.saved_tensors
+        lengths, offsets = ctx.saved_tensors
         grad_in = padded_dense_to_jagged2_fwd(grad_output, lengths, offsets, max_length)
         return (grad_in, None, None, None)


fbgemm_gpu/sll/triton/triton_jagged_bmm.py
@@ -326,7 +326,7 @@ class JaggedDenseBmm(torch.autograd.Function):

         # logging.info(f"Jagged bmm backward called")

-        (x, y, x_offsets) = ctx.saved_tensors
+        x, y, x_offsets = ctx.saved_tensors
         N = ctx.N
         grad_x = triton_jagged_dense_bmm(
             grad_output, y.permute(0, 2, 1), x_offsets, N, allow_tf32=ctx.allow_tf32
@@ -369,7 +369,7 @@ class JaggedJaggedBmm(torch.autograd.Function):
         # dXT = dZ * YT -> dX = Y * dZT
         # dY = X * dZ -> X * dZ
         """
-        (x, y, offsets) = ctx.saved_tensors
+        x, y, offsets = ctx.saved_tensors
         N = ctx.N
         grad_x = triton_jagged_dense_bmm(
             y, grad_output.permute(0, 2, 1), offsets, N, allow_tf32=ctx.allow_tf32

fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py
@@ -8,6 +8,7 @@

 import torch

+# fmt:skip
 from fbgemm_gpu.triton.jagged.triton_jagged_tensor_ops import (
     dense_to_jagged,
     jagged_to_dense,

fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py
@@ -6,7 +6,6 @@

 # pyre-unsafe

-from typing import Tuple

 import torch
 import triton
@@ -171,7 +170,7 @@ def jagged_dense_flash_attention_fwd(
    jagged_offsets,
    max_seq_len,
    allow_tf32=False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Q: jagged tensor, [sum_B, D]
    K: dense tensor, [B, D, T]
@@ -192,7 +191,7 @@ def jagged_dense_flash_attention_fwd(
     assert Q.size() == V.size(), "incompatible dimensions for Q and V"
     assert jagged_offsets.is_contiguous(), "jagged_offsets must be contiguous"

-    (B, D, T) = K.size()
+    B, D, T = K.size()
     assert D > 0 and (D & (D - 1)) == 0, "D needs to be a power of two"

     attn_out = torch.zeros(B, T, D, dtype=Q.dtype, device=Q.device)
@@ -650,7 +649,7 @@ def jagged_dense_flash_attention_bwd(
    jagged_offsets,
    max_seq_len,
    allow_tf32=False,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Q: jagged tensor, [sum_B, D]
    K: dense tensor, [B, D, T]
@@ -668,7 +667,7 @@ def jagged_dense_flash_attention_bwd(
     if not do.is_contiguous():
         do = do.contiguous()

-    (B, D, T) = K.size()
+    B, D, T = K.size()
     BLOCK_T = 32
     BLOCK_L = 32
     BLOCK_D = D
@@ -812,7 +811,7 @@ class JaggedDenseFlashAttention(torch.autograd.Function):
     # pyre-fixme
     def backward(
         ctx, do: torch.Tensor
-    ) -> Tuple[
+    ) -> tuple[
         torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, None, None, None
     ]:
         Q, K, V, attn_bias, jagged_offsets, lse, attn_out = ctx.saved_tensors

fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py
@@ -6,7 +6,6 @@

 # pyre-unsafe

-from typing import Tuple

 import torch
 import triton
@@ -607,7 +606,7 @@ class JaggedFlashAttentionBasic(torch.autograd.Function):
     # pyre-fixme
     def backward(
         ctx, grad_output: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None, None, None]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None, None, None]:
         (
             jagged_Q,
             jagged_K,

fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py
@@ -6,7 +6,6 @@

 # pyre-unsafe

-from typing import Tuple

 import torch
 import triton
@@ -688,7 +687,7 @@ class MultiHeadJaggedFlashAttention(torch.autograd.Function):
     # pyre-fixme
     def backward(
         ctx, grad_output: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None, None]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None, None]:
         (
             jagged_Q,
             jagged_K,