PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py ADDED Viewed

@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional, Tuple
+import torch
+@dataclass
+class ScaledMMLinearLayerConfig:
+    # Describes the quantization scheme of one scaled-matmul linear layer.
+    # Kernels receive it in `can_implement` and keep it as `self.config`.
+    # Whether the weight scales are channelwise; kernels that see False on
+    # a fused module expand the per-tensor scales to channelwise ones.
+    is_channelwise: bool
+    # Whether the input (activation) scale is static, i.e. read from the
+    # layer, rather than dynamic (input scale attribute reset to None).
+    is_static_input_scheme: bool
+    # Whether the input quantization is symmetric; when False the kernels
+    # additionally handle an input zero point / azp adjustment term.
+    input_symmetric: bool
+class ScaledMMLinearKernel(ABC):
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError
+    @classmethod
+    @abstractmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        raise NotImplementedError
+    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
+                 w_s_param_name: str, i_s_param_name: str,
+                 i_zp_param_name: str, azp_adj_param_name: str) -> None:
+        assert self.can_implement(c)
+        self.config = c
+        self.w_q_name = w_q_param_name
+        self.w_s_name = w_s_param_name
+        self.i_s_name = i_s_param_name
+        self.i_zp_name = i_zp_param_name
+        self.azp_adj_name = azp_adj_param_name
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        raise NotImplementedError
+    @abstractmethod
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        raise NotImplementedError
+    def _get_weight_params(
+            self, layer: torch.nn.Module) -> Tuple[
+                torch.Tensor,  # weight
+                torch.Tensor,  # weight_scale
+                Optional[torch.Tensor],  # input_scale,
+                Optional[torch.Tensor],  # input_zp
+                Optional[torch.Tensor],  # azp_adj
+            ]:
+        return (
+            getattr(layer, self.w_q_name),
+            getattr(layer, self.w_s_name),
+            getattr(layer, self.i_s_name),
+            getattr(layer, self.i_zp_name),
+            getattr(layer, self.azp_adj_name),
+        )

vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py ADDED Viewed

@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import Dict, List, Optional, Type
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
+    AiterScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
+    CutlassScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    ScaledMMLinearKernel, ScaledMMLinearLayerConfig)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import (
+    TritonScaledMMLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
+    XLAScaledMMLinearKernel)
+from vllm.platforms import PlatformEnum, current_platform
+# in priority/performance order (when available)
+# Candidate kernel classes per platform; `choose_scaled_mm_linear_kernel`
+# walks the list for the current platform front to back and returns the
+# first kernel that is not disabled, meets the compute capability and can
+# implement the layer config. Platforms not listed here have no candidates.
+_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
+    PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
+    PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
+    PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel],
+    PlatformEnum.TPU: [XLAScaledMMLinearKernel],
+}
+def choose_scaled_mm_linear_kernel(
+        config: ScaledMMLinearLayerConfig,
+        compute_capability: Optional[int] = None
+) -> Type[ScaledMMLinearKernel]:
+    """
+    Choose an ScaledMMLinearKernel that can implement the given config for the
+    given compute capability. Attempts to choose the best kernel in terms of
+    performance.
+    Args:
+        config (ScaledMMLinearLayerConfig): Description of the linear layer
+            to be implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+            the target device, if None uses `current_platform` to get the
+            compute capability. Defaults to None.
+    Raises:
+        ValueError: If no kernel can implement the given config.
+    Returns:
+        Type[ScaledMMLinearKernel]: Chosen kernel.
+    """
+    if compute_capability is None:
+        _cc = current_platform.get_device_capability()
+        if _cc is not None:
+            compute_capability = _cc[0] * 10 + _cc[1]
+    failure_reasons = []
+    for kernel in _POSSIBLE_KERNELS[current_platform._enum]:
+        if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "")\
+            .split(","):
+            failure_reasons.append(
+                f' {kernel.__name__} disabled by environment variable')
+            continue
+        # If the current platform uses compute_capability,
+        # make sure the kernel supports the compute cability.
+        if compute_capability is not None:
+            kernel_min_capability = kernel.get_min_capability()
+            if (kernel_min_capability is not None
+                    and kernel_min_capability > compute_capability):
+                failure_reasons.append(
+                    f"{kernel.__name__} requires capability "
+                    f"{kernel_min_capability}, current compute capability "
+                    f"is {compute_capability}")
+                continue
+        can_implement, failure_reason = kernel.can_implement(config)
+        if can_implement:
+            return kernel
+        else:
+            failure_reasons.append(
+                f' {kernel.__name__} cannot implement due to: {failure_reason}'
+            )
+    raise ValueError(
+        "Failed to find a kernel that can implement the "\
+        "ScaledMM linear layer. Reasons: \n"
+        + '\n'.join(failure_reasons))

vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py ADDED Viewed

@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+import torch
+import vllm.envs as envs
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from .cutlass import CutlassScaledMMLinearKernel
+from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
+class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 90
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        if not current_platform.is_rocm():
+            return (
+                False,
+                "AiterScaledMMLinearKernel requires `aiter` which is not " +
+                "currently supported on non-ROCm platform.")
+        try:
+            import aiter  # noqa: F401 # deliberately attempt to import aiter
+        except Exception:
+            return (
+                False,
+                "AiterScaledMMLinearKernel requires `aiter` which is not " +
+                "installed on ROCm.")
+        # Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled
+        if not (
+            envs.VLLM_ROCM_USE_AITER_LINEAR \
+            and envs.VLLM_ROCM_USE_AITER
+        ):
+            return (False, "AiterScaledMMLinearKernel is disabled. " +
+                    "Enable by setting `VLLM_ROCM_USE_AITER=1` " +
+                    "and `VLLM_ROCM_USE_AITER_LINEAR=1`. " +
+                    "`VLLM_ROCM_USE_AITER_LINEAR` default is True.")
+        if not c.input_symmetric:
+            return (False,
+                    "AiterScaledMMLinearKernel only supports symmetric " +
+                    "quantization.")
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        `AiterScaledMMLinearKernel` implements a fused version of
+            `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
+        where scale_a * a and scale_b * b are implemented using numpy-style
+        broadcasting.
+        Currently only support per-tensor-per-tensor GEMM
+        and per-token-per-channel GEMM through AITER
+        w8a8 scaled gemm. `AiterScaledMMLinearKernel` also does not support
+        ATIER block scaled GEMM and mix-precision GEMM.
+        """
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+        # ops.scaled_int8_quant supports both dynamic and static quant:
+        # * dynamic, i_s is None and x_s computed from x.
+        # * static, i_s is scalar and x_s is i_s.
+        symmetric = azp_adj is None
+        assert symmetric, ("AiterScaledMMLinearKernel only supports"
+                           " symmetric quantization.")
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
+                                               i_s,
+                                               i_zp,
+                                               symmetric=symmetric)
+        assert x_zp is None, ("AiterScaledMMLinearKernel only supports"
+                              " symmetric quantization.")
+        out_dtype = x.dtype
+        assert (w_q.shape[0] % 16 == 0 and w_q.shape[1] % 16 == 0)
+        assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+        assert bias is None or bias.shape[0] == w_q.shape[
+            1] and bias.dtype == out_dtype
+        m = x_q.shape[0]  # a
+        n = w_q.shape[1]  # b
+        per_tensor_scale_a = (x_s.numel() == 1)
+        per_tensor_scale_b = (w_s.numel() == 1)
+        per_token_scale_a = (x_s.numel() == m)
+        per_channel_scale_b = (w_s.numel() == n)
+        # @TODO:
+        # Maybe broadcast the per-tensor-scale into per-channel-scale
+        # if one of the scale is a per-channel-scale.
+        # For now, it only supports:
+        # - per-tensor-per-tensor a8w8 scaled GEMM, and
+        # - per-token-per-channel a8w8 scaled GEMM
+        assert ((per_tensor_scale_a and per_tensor_scale_b)
+                or (per_token_scale_a and per_channel_scale_b)), (
+                    "Currently only support per-tensor-per-tensor GEMM " +
+                    " and per-token-per-channel GEMM through AITER"
+                    " w8a8 scaled gemm. `AiterScaledMMLinearKernel` " +
+                    "does not support AITER block scaled GEMM.")
+        from aiter import gemm_a8w8_CK
+        # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects
+        # a to be [M, K]
+        # b to be [N, K]
+        # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format
+        return gemm_a8w8_CK(x_q, w_q.t(), x_s, w_s, bias).to(out_dtype)

vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py ADDED Viewed

@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
+    """Scaled-matmul kernel backed by `ops.cutlass_scaled_mm[_azp]`.
+    Accepted on CUDA and CPU platforms (see `can_implement`).
+    """
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum compute capability (75) for this kernel."""
+        return 75
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        """Accept any config as long as the platform is CUDA or CPU.
+        Returns:
+            `(True, None)` when usable, otherwise `(False, reason)`.
+        """
+        if (not current_platform.is_cuda() and not current_platform.is_cpu()):
+            return False, "CutlassScaledMM requires running on CUDA or CPU."
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Rewrite the layer's weight, scales and zero points for the kernel.
+        Replaces or resets, in this order: the quantized weight, the weight
+        scale, the input scale / input zero point, and the azp adjustment
+        term. The order matters: the azp adjustment is computed from the
+        already transposed weight and the already rewritten zero point.
+        """
+        # WEIGHT
+        # Cutlass kernels need transposed weight.
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(
+            layer, self.w_q_name,
+            torch.nn.Parameter(weight.t().data, requires_grad=False))
+        # WEIGHT SCALE
+        # Cutlass kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+        # INPUT SCALE
+        if self.config.is_static_input_scheme:
+            input_scale = getattr(layer, self.i_s_name)
+            if self.config.input_symmetric:
+                # Symmetric static: collapse the loaded scales to their max
+                # and drop the zero point.
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(input_scale.max(), requires_grad=False))
+                setattr(layer, self.i_zp_name, None)
+            else:
+                input_zero_point = getattr(layer, self.i_zp_name)
+                # reconstruct the ranges
+                int8_traits = torch.iinfo(torch.int8)
+                azps = input_zero_point.to(dtype=torch.int32)
+                range_max = (input_scale * (int8_traits.max - azps)).max()
+                range_min = (input_scale * (int8_traits.min - azps)).min()
+                # Single scale spanning the widest reconstructed range.
+                scale = (range_max - range_min) / (int8_traits.max -
+                                                   int8_traits.min)
+                replace_parameter(
+                    layer, self.i_s_name,
+                    torch.nn.Parameter(scale, requires_grad=False))
+                # AZP loaded as int8 but used as int32
+                azp = (int8_traits.min -
+                       range_min / scale).to(dtype=torch.int32)
+                replace_parameter(layer, self.i_zp_name,
+                                  torch.nn.Parameter(azp, requires_grad=False))
+        else:
+            # Dynamic input quantization: nothing is stored on the layer.
+            setattr(layer, self.i_s_name, None)
+            setattr(layer, self.i_zp_name, None)
+        # azp_adj is the AZP adjustment term, used to account for weights.
+        # It does not depend on scales or azp, so it is the same for
+        # static and dynamic quantization.
+        # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
+        # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
+        if not self.config.input_symmetric:
+            # Re-read the weight: it was replaced by its transpose above.
+            weight = getattr(layer, self.w_q_name)
+            azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
+            if self.config.is_static_input_scheme:
+                # cutlass_w8a8 requires azp to be folded into azp_adj
+                # in the per-tensor case
+                azp_adj = getattr(layer, self.i_zp_name) * azp_adj
+            setattr(layer, self.azp_adj_name,
+                    torch.nn.Parameter(azp_adj, requires_grad=False))
+        else:
+            setattr(layer, self.azp_adj_name, None)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Quantize `x` to int8 and run the scaled matmul against the weight.
+        Uses `ops.cutlass_scaled_mm_azp` when the quantization returned a
+        zero point, otherwise `ops.cutlass_scaled_mm`. The output dtype is
+        `x.dtype`.
+        """
+        w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
+        # ops.scaled_int8_quant supports both dynamic and static quant:
+        # * dynamic, i_s is None and x_s computed from x.
+        # * static, i_s is scalar and x_s is i_s.
+        # A missing azp_adj is what marks the layer as symmetric.
+        symmetric = azp_adj is None
+        x_q, x_s, x_zp = ops.scaled_int8_quant(x,
+                                               i_s,
+                                               i_zp,
+                                               symmetric=symmetric)
+        if x_zp is not None:
+            # Currently, static is always per-tensor and dynamic is per-token
+            static = i_zp is not None
+            # In the static case the zero point is already folded into
+            # azp_adj (see process_weights_after_loading), so none is passed.
+            azp = None if static else x_zp
+            return ops.cutlass_scaled_mm_azp(x_q,
+                                             w_q,
+                                             scale_a=x_s,
+                                             scale_b=w_s,
+                                             out_dtype=x.dtype,
+                                             azp_adj=azp_adj,
+                                             azp=azp,
+                                             bias=bias)
+        return ops.cutlass_scaled_mm(x_q,
+                                     w_q,
+                                     scale_a=x_s,
+                                     scale_b=w_s,
+                                     out_dtype=x.dtype,
+                                     bias=bias)

vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py ADDED Viewed

@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+import torch
+from vllm.platforms import current_platform
+from .cutlass import CutlassScaledMMLinearKernel
+from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig
+class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        if current_platform.is_cpu():
+            return (
+                False,
+                "TritonScaledMMLinearKernel requires Triton which is not " +
+                "currently supported on CPU.")
+        if not c.input_symmetric:
+            return (False,
+                    "TritonScaledMMLinearKernel only supports symmetric " +
+                    "quantization.")
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return super().apply_weights(layer, x, bias)

vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py ADDED Viewed

@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+from typing import Optional, Tuple
+import torch
+from functorch.experimental.control_flow import cond  # noqa: F401
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise)
+from vllm.platforms import current_platform
+from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
+                                   ScaledMMLinearLayerConfig)
+class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
+    @classmethod
+    def get_min_capability(cls) -> int:
+        raise NotImplementedError(
+            "TPU platform does have a concept of compute capability, "
+            "this method should not be called.")
+    @classmethod
+    def can_implement(
+            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+        if not current_platform.is_tpu():
+            return False, "ScaledMMXLA requires running on TPU."
+        if c.is_static_input_scheme:
+            return False, "ScaledMMXLA requires dynamic activation scales."
+        if not c.input_symmetric:
+            return False, "ScaledMMXLA requires symmetric activation scales."
+        if not c.is_channelwise:
+            return False, "ScaledMMXLA requires channelwise weight scales"
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # WEIGHT
+        # [out, in] (different than cutlass_scaled_mm)
+        weight = getattr(layer, self.w_q_name)
+        replace_parameter(layer, self.w_q_name,
+                          torch.nn.Parameter(weight.data, requires_grad=False))
+        # WEIGHT SCALE
+        # XLA kernels support only per-tensor and per-channel.
+        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
+        # scales being passed to the kernel), convert to the per-channel case.
+        is_fused_module = len(layer.logical_widths) > 1
+        weight_scale = getattr(layer, self.w_s_name)
+        if is_fused_module and not self.config.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+        # [out_channel,] (different than cutlass_scaled_mm)
+        weight_scale = weight_scale.squeeze(-1)
+        replace_parameter(
+            layer, self.w_s_name,
+            torch.nn.Parameter(weight_scale.data, requires_grad=False))
+        # Only support symmetric dynamic activation quantization.
+        setattr(layer, self.i_s_name, None)
+        setattr(layer, self.i_zp_name, None)
+        setattr(layer, self.azp_adj_name, None)
+        # Filter warning for cond usage in apply_weights. It is okay
+        # to specialize the graph since bias is not dynamic.
+        warnings.filterwarnings(
+            "ignore",
+            message=
+            "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."  # noqa: E501
+        )
+    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x
+    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+        return x + bias
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        w_q, w_s, _, _, _ = self._get_weight_params(layer)
+        import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
+        out = torch.ops.xla.quantized_matmul(x,
+                                             w_q,
+                                             w_s,
+                                             zero_point=None,
+                                             block_size=-1,
+                                             int4_weight=False,
+                                             quantize_activation=True)
+        # `quantized_matmul` output is fp32, cast it down to bf16 for perf
+        out = out.to(x.dtype)
+        # Explicitly capture control flow to make dynamo happy.
+        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
+        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])

vllm/model_executor/layers/quantization/kv_cache.py ADDED Viewed

@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+import torch
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.platforms import current_platform
+logger = init_logger(__name__)
+class BaseKVCacheMethod(QuantizeMethodBase):
+    """
+    Quant method that adds `_k_scale` and `_v_scale` attributes to the
+    Attention layer to support loading those scaling factors from checkpoints.
+    The k/v_scale will be used to:
+        - quantize k/v_cache entries before saving them to the cache
+        - dequantize k/v_cache entries before fetching them from the cache
+    :param quant_config: the appropriate QuantizationConfig
+    """
+    def __init__(self, quant_config: QuantizationConfig):
+        self.quant_config = quant_config
+    def create_weights(self, layer: torch.nn.Module):
+        """
+        Create "weight" (aka q_scale, k_scale and v_scale)
+        for an attention layer.
+        """
+        # Initialize the Q and KV cache scales to -1.0, an invalid value.
+        # If the q and k/v_scales appear in the checkpoint, it will be
+        # overwritten when loading weights.
+        layer.q_scale = torch.nn.Parameter(torch.tensor(-1.0),
+                                           requires_grad=False)
+        layer.k_scale = torch.nn.Parameter(torch.tensor(-1.0),
+                                           requires_grad=False)
+        layer.v_scale = torch.nn.Parameter(torch.tensor(-1.0),
+                                           requires_grad=False)
+        # Initialize P = softmax(QK^T) scales
+        layer.prob_scale = torch.nn.Parameter(torch.tensor(-1.0),
+                                              requires_grad=False)
+    def apply(self, layer: torch.nn.Module) -> torch.Tensor:
+        raise RuntimeError(
+            f"{self.__class__.__name__}.apply should not be called.")
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0
+        # regardless whether the kv-scale is available in the checkpoint.
+        # No need to process kv scales after loading if we are going to
+        # calculate them on the fly.
+        if layer.kv_cache_dtype != "auto" and not layer.calculate_kv_scales:
+            if layer.k_scale > 0.0 and layer.v_scale > 0.0:
+                # We prefer to use separate k_scale and v_scale if present
+                k_scale = layer.k_scale.to("cpu").tolist()
+                v_scale = layer.v_scale.to("cpu").tolist()
+                if current_platform.is_fp8_fnuz():
+                    k_scale *= 2
+                    v_scale *= 2
+            elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
+                # If no scales were loaded (both scales are invalid negative
+                # values), use the default value of 1.0
+                k_scale = 1.0
+                v_scale = 1.0
+            else:
+                # If we find a single kv_scale in the checkpoint, we remap
+                # kv_scale to k_scale during weight loading, and duplicate
+                # k_scale to v_scale here
+                assert layer.k_scale > 0.0
+                scale_to_duplicate = max(layer.k_scale, layer.v_scale)
+                k_scale = scale_to_duplicate.to("cpu").tolist()
+                v_scale = scale_to_duplicate.to("cpu").tolist()
+                if current_platform.is_fp8_fnuz():
+                    k_scale *= 2
+                    v_scale *= 2
+            if not isinstance(k_scale, float) or not isinstance(
+                    v_scale, float):
+                raise ValueError("Only support per-tensor scaling factor "
+                                 "for fp8 KV cache")
+            if layer.q_scale < 0.0:
+                logger.warning_once(
+                    "Checkpoint does not provide a q scaling factor. "
+                    "Setting it to k_scale. This only matters for "
+                    "the flash-attn backend.")
+                layer._q_scale.copy_(k_scale)
+            # These are used in the final Attention.forward()
+            layer._k_scale.copy_(k_scale)
+            layer._v_scale.copy_(v_scale)
+            layer._k_scale_float = k_scale
+            layer._v_scale_float = v_scale
+            if (k_scale == 1.0 and v_scale == 1.0
+                    and "e5m2" not in layer.kv_cache_dtype):
+                logger.warning_once(
+                    "Using KV cache scaling factor 1.0 for fp8_e4m3. This "
+                    "may cause accuracy issues. Please make sure k/v_scale "
+                    "scaling factors are available in the fp8 checkpoint.")
+        if layer.q_scale > 0.0:
+            q_scale = layer.q_scale
+            if current_platform.is_fp8_fnuz():
+                q_scale *= 2
+            layer.calculate_kv_scales = False
+        else:
+            q_scale = 1.0
+        if layer.prob_scale > 0.0:
+            prob_scale = layer.prob_scale
+            if current_platform.is_fp8_fnuz():
+                prob_scale *= 2
+        else:
+            prob_scale = 1.0
+        is_singleton_float = lambda x: isinstance(x, float) or isinstance(
+            x, torch.Tensor) and x.numel() == 1 and x.is_floating_point()
+        if not is_singleton_float(q_scale) or not is_singleton_float(
+                prob_scale):
+            raise ValueError("Only support per-tensor scaling factor"
+                             "for fp8-quantized Q/prob")
+        # These are used in the final Attention.forward()
+        layer._q_scale.copy_(q_scale)
+        layer._prob_scale.copy_(prob_scale)
+        if q_scale == 1.0 or prob_scale == 1.0:
+            logger.warning_once(
+                f"Using Q scale {q_scale} and prob scale {prob_scale} "
+                "with fp8 attention. This may cause accuracy issues. "
+                "Please make sure Q/prob scaling factors are "
+                "available in the fp8 checkpoint.")
+        del layer.k_scale
+        del layer.v_scale
+        del layer.q_scale
+        del layer.prob_scale