PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/model_executor/layers/quantization/inc.py ADDED Viewed

@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Intel Gaudi supports quantization of various modules and functions,
+# including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`.
+# During model loading,
+# INC will patch layers with quantization/dequantization operators.
+# Meanwhile, INC will convert the original weights to the target datatype
+# and load them onto the target device.
+# Static scaling should be provided through QUANT_CONFIG:
+# `QUANT_CONFIG` is an environment variable,
+# that points to the measurement or quantization JSON config file.
+# The measurement configuration file is used during the calibration procedure,
+# to collect measurements for a given model.
+# The quantization configuration is used during inference.
+# For more information, please refer to:
+# https://docs.habana.ai/en/v1.21.1/PyTorch/vLLM_Inference/vLLM_FP8_Inference.html
+from typing import Any, Optional
+import torch
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE,
+    UnquantizedFusedMoEMethod,
+)
+from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+class INCConfig(QuantizationConfig):
+    """Quantization config for FP8 through Intel Neural Compressor (INC).
+    This class only hands out *unquantized* layer methods; per the module
+    header comment, INC itself patches layers during model loading.
+    """
+    @classmethod
+    def get_name(cls) -> QuantizationMethods:
+        """Return the registry name of this quantization method."""
+        return "inc"
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        """Return the activation dtypes accepted by this config."""
+        return [torch.bfloat16]
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "INCConfig":
+        """Not supported for INC: always raises ``AssertionError``."""
+        raise AssertionError
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        """Pick the method object for ``layer``.
+        Linear layers and fused-MoE layers get their unquantized methods;
+        every other layer type gets ``None``. ``prefix`` is not consulted.
+        """
+        if isinstance(layer, LinearBase):
+            return UnquantizedLinearMethod()
+        if isinstance(layer, FusedMoE):
+            return UnquantizedFusedMoEMethod(layer.moe_config)
+        return None
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Not supported for INC: always raises ``AssertionError``."""
+        raise AssertionError
+    @staticmethod
+    def get_config_filenames() -> list[str]:
+        """Return the config file names to look for (none for INC)."""
+        return []

vllm/model_executor/layers/quantization/input_quant_fp8.py ADDED Viewed

@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn.functional as F
+from vllm import _custom_ops as ops
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.platforms import current_platform
+# Using the default value (240.0) from pytorch will cause accuracy
+# issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm.
+# FP8 dtype reported by the current platform, and its numeric limits.
+_FP8_DTYPE = current_platform.fp8_dtype()
+_FP8_FINFO = torch.finfo(_FP8_DTYPE)
+# Clamp range applied before casting to FP8: +/-224.0 when the platform
+# reports an fnuz FP8 format, otherwise the dtype's own finfo limits.
+_FP8_MAX = 224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.max
+_FP8_MIN = -224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.min
+# Lower bound for dynamically computed scales (used by the clamp(min=...)
+# calls in QuantFP8), so an all-zero input never produces a zero scale.
+_FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0)
+@CustomOp.register("quant_fp8")
+class QuantFP8(CustomOp):
+    """
+    Quantize input tensor to FP8 (per-tensor, per-token, or per-group).
+    This CustomOp supports both static and dynamic quantization.
+    ``forward_cuda`` delegates to vLLM custom ops / fp8_utils, while
+    ``forward_native`` implements the same call signature with plain
+    PyTorch operations.
+    """
+    def __init__(
+        self,
+        static: bool,
+        group_shape: GroupShape,
+        num_token_padding: int | None = None,
+        column_major_scales: bool = False,
+        use_ue8m0: bool | None = None,  # for Torch compile
+    ):
+        """
+        :param static: static or dynamic quantization
+        :param group_shape: quantization group shape (PER_TOKEN, PER_TENSOR,
+            or arbitrary block size)
+        :param num_token_padding: Pad the token dimension of output to this
+            size
+        :param column_major_scales: For group quantization, output scales in
+            column major format
+        :param use_ue8m0: For group quantization; forwarded to
+            ``per_token_group_quant_fp8`` on the CUDA path, and when truthy
+            on the native path the scales are rounded up to a power of two
+        """
+        super().__init__()
+        self.static = static
+        self.group_shape = group_shape
+        self.num_token_padding = num_token_padding
+        self.column_major_scales = column_major_scales
+        self.use_ue8m0 = use_ue8m0
+        self.is_group_quant = group_shape.is_per_group()
+        if self.is_group_quant:
+            assert not static, "Group quantization only supports dynamic mode"
+            # The group width is taken from the column extent of the shape.
+            self.group_size = group_shape.col
+        else:
+            assert group_shape in {GroupShape.PER_TOKEN, GroupShape.PER_TENSOR}
+            assert not static or group_shape == GroupShape.PER_TENSOR, (
+                "Only per-tensor scales supported for static quantization."
+            )
+            # Only defined on the non-group path; forward_cuda reads it after
+            # the group-quant early return.
+            self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        scale: torch.Tensor | None = None,
+        scale_ub: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize ``x`` using vLLM's custom kernels.
+        :param x: tensor to quantize
+        :param scale: required exactly when ``static`` is True; must be None
+            for dynamic and group quantization
+        :param scale_ub: optional single-element upper bound, accepted only
+            for dynamic per-token quantization
+        :return: whatever the underlying op returns (annotated as a
+            ``(quantized, scale)`` tuple)
+        """
+        if self.is_group_quant:
+            assert scale is None, "Group quantization is always dynamic"
+            # Imported lazily inside the method rather than at module import.
+            from vllm.model_executor.layers.quantization.utils import fp8_utils
+            return fp8_utils.per_token_group_quant_fp8(
+                x,
+                group_size=self.group_size,
+                column_major_scales=self.column_major_scales,
+                dtype=_FP8_DTYPE,
+                use_ue8m0=self.use_ue8m0,
+            )
+        # A scale must be supplied iff this op was built as static.
+        assert (scale is not None) == self.static
+        assert scale_ub is None or (
+            not self.static
+            and self.group_shape == GroupShape.PER_TOKEN
+            and scale_ub.numel() == 1
+        )
+        return ops.scaled_fp8_quant(
+            x,
+            scale,
+            num_token_padding=self.num_token_padding,
+            scale_ub=scale_ub,
+            use_per_token_if_dynamic=self.use_per_token_if_dynamic,
+        )
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        scale: torch.Tensor | None = None,
+        scale_ub: torch.Tensor | None = None,
+    ):
+        """
+        Pure-PyTorch quantization with the same arguments as ``forward_cuda``.
+        :return: ``(out, scale)`` where ``out`` has dtype ``_FP8_DTYPE`` and
+            ``scale`` is either the supplied static scale or the dynamically
+            computed one
+        """
+        if self.is_group_quant:
+            assert scale is None, "Group quantization is always dynamic"
+            return self._quantize_group_native(x)
+        # A scale must be supplied iff this op was built as static.
+        assert (scale is not None) == self.static
+        assert scale_ub is None or (
+            not self.static
+            and self.group_shape == GroupShape.PER_TOKEN
+            and scale_ub.numel() == 1
+        )
+        if scale is None:
+            # Dynamic mode: derive the scale from the absolute maximum.
+            if self.group_shape == GroupShape.PER_TOKEN:
+                # One absmax per row of the last dimension, kept as a
+                # trailing size-1 dim so it broadcasts against x.
+                x_max, _ = x.abs().max(dim=-1)
+                x_max = x_max.unsqueeze(-1).to(torch.float32)
+                if scale_ub is not None:
+                    x_max = x_max.clamp(max=scale_ub)
+            else:
+                # Per-tensor: a single global absmax, shaped (1,).
+                x_max = x.abs().max().unsqueeze(-1).to(torch.float32)
+            scale = (x_max / _FP8_MAX).clamp(min=_FP8_MIN_SCALING_FACTOR)
+        # Even for dynamic per-token scales,
+        # reciprocal performs slightly better than division
+        out = x.to(torch.float32) * scale.reciprocal()
+        out = out.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE)
+        # This currently generates an extra Triton kernel in compilation.
+        # Fortunately, we don't use padding if compiling.
+        # TODO(luka): benchmark torch._scaled_mm to hopefully remove padding
+        #  in general.
+        if self.num_token_padding is not None:
+            # Zero-pad rows up to num_token_padding; never truncates.
+            # NOTE(review): the pad spec targets the second-to-last dim while
+            # the size is read from dim 0 - assumes a 2-D `out`; confirm.
+            padding = max(self.num_token_padding - out.size(0), 0)
+            out = F.pad(out, (0, 0, 0, padding), "constant", 0.0)
+        return out, scale
+    def _quantize_group_native(
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Dynamic per-group quantization along the last dimension.
+        :param x: tensor whose last dimension is split into groups of
+            ``self.group_size`` (zero-padded if not a multiple)
+        :return: ``(x_quant, scales)``; ``x_quant`` has ``x``'s original shape
+            and ``scales`` has shape ``x.shape[:-1] + (num_groups,)``
+        """
+        orig_shape = x.shape
+        hidden_dim = x.shape[-1]
+        # Ceil-divide so a trailing partial group still gets its own scale.
+        num_groups = (hidden_dim + self.group_size - 1) // self.group_size
+        padded_dim = num_groups * self.group_size
+        if padded_dim != hidden_dim:
+            # Zero padding does not affect the per-group absolute maximum.
+            padding = padded_dim - hidden_dim
+            x = F.pad(x, (0, padding), mode="constant", value=0.0)
+        x_grouped = x.view(-1, num_groups, self.group_size)
+        # Per-group absolute maximum, kept with a trailing size-1 dim.
+        absmax = x_grouped.abs().max(dim=-1, keepdim=True)[0].float()
+        scales_raw = absmax / _FP8_MAX
+        if self.use_ue8m0:
+            # Round each scale up to the next power of two.
+            scales_raw = torch.exp2(torch.ceil(torch.log2(scales_raw)))
+        scales = (scales_raw).clamp(min=_FP8_MIN_SCALING_FACTOR)
+        x_scaled = x_grouped / scales
+        x_quant = x_scaled.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE)
+        x_quant = x_quant.view(-1, padded_dim)
+        if padded_dim != hidden_dim:
+            # Drop the columns that were added as padding above.
+            x_quant = x_quant[..., :hidden_dim]
+        x_quant = x_quant.view(orig_shape)
+        scales = scales.squeeze(-1)
+        scales = scales.reshape(orig_shape[:-1] + (num_groups,))
+        if self.column_major_scales:
+            # Same logical shape, but the memory layout is made contiguous
+            # along the second-to-last dimension instead of the last one.
+            scales = scales.transpose(-2, -1).contiguous().transpose(-1, -2)
+        return x_quant, scales

vllm/model_executor/layers/quantization/ipex_quant.py ADDED Viewed

@@ -0,0 +1,467 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+from typing import Any, Optional
+import torch
+from packaging import version
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+from vllm._ipex_ops import ipex_ops as ops
+from vllm.model_executor.layers.fused_moe import (
+    FusedMoEMethodBase,
+    FusedMoeWeightScaleSupported,
+)
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
+from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
+from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
+from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+# Oldest intel_extension_for_pytorch release accepted by the version checks
+# performed in the IPEX linear methods of this module.
+MIN_IPEX_VERSION = "2.6.0"
+class IPEXConfig(QuantizationConfig):
+    """Quantization config class using IPEX for the CPU/XPU backend,
+    covering AWQ and GPTQ checkpoints. Only 4-bit weights are accepted.
+    """
+    # Integer ids passed to IPEX as ``quant_method`` by the linear methods.
+    IPEX_QUANT_METHOD_MAP = {
+        "awq": 1,
+        "gptq": 0,
+    }
+    def __init__(
+        self,
+        method: str,
+        weight_bits: int,
+        group_size: int,
+        modules_to_not_convert: list[str] | None = None,
+        desc_act: bool | None = None,
+        lm_head_quantized: bool | None = None,
+        is_sym: bool | None = None,
+    ) -> None:
+        """Validate and store the quantization parameters.
+        :param method: ``"awq"`` or ``"gptq"``
+        :param weight_bits: weight bit width; only 4 is supported
+        :param group_size: quantization group size
+        :param modules_to_not_convert: layer name patterns left unquantized
+            (consulted for AWQ only); ``None`` is stored as an empty list
+        :param desc_act: GPTQ activation-order flag
+        :param lm_head_quantized: whether the LM head is quantized
+        :param is_sym: symmetric (True) or asymmetric (False) weight scheme
+        :raises ValueError: if ``weight_bits`` or ``method`` is unsupported
+        """
+        super().__init__()
+        # Validate before deriving anything from the arguments, so that an
+        # invalid weight_bits (e.g. 0 or None) is reported as the ValueError
+        # below instead of failing inside the pack_factor division.
+        if weight_bits not in [4]:
+            raise ValueError(
+                f"IPEX quantization supports weight bits [4], "
+                f"but got {weight_bits}."
+            )
+        if method not in ["awq", "gptq"]:
+            raise ValueError(
+                f"IPEX quantization supports [awq, gptq], but got {method}."
+            )
+        self.method = method
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.modules_to_not_convert = modules_to_not_convert or []
+        self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
+        self.is_sym = is_sym
+        # Number of quantized weights packed into one 32-bit word.
+        self.pack_factor = 32 // self.weight_bits
+    def __repr__(self) -> str:
+        return (
+            f"IPEXConfig(method={self.method},"
+            f"weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size})"
+        )
+    @classmethod
+    def get_name(cls) -> QuantizationMethods:
+        """Return the registry name of this quantization method."""
+        return "ipex"
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        """Return the activation dtypes accepted by this config."""
+        return [torch.bfloat16, torch.float16]
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return -1 as the minimum capability value."""
+        return -1
+    @staticmethod
+    def get_config_filenames() -> list[str]:
+        """Return the quantization config file names to look for."""
+        return [
+            "quant_config.json",
+            "quantize_config.json",
+        ]
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "IPEXConfig":
+        """Build an ``IPEXConfig`` from a checkpoint quantization config.
+        AWQ configs are read with the AWQ key names; any other
+        ``quant_method`` is parsed with the GPTQ key names, and the
+        constructor then rejects methods other than awq/gptq.
+        """
+        method = cls.get_from_keys(config, ["quant_method"]).lower()
+        if method == "awq":
+            weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
+            group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
+            modules_to_not_convert = cls.get_from_keys_or(
+                config, ["modules_to_not_convert"], None
+            )
+            # AWQ records "zero_point"; symmetric means no zero point.
+            is_sym = not cls.get_from_keys_or(config, ["zero_point"], default=False)
+            return cls(
+                method,
+                weight_bits,
+                group_size,
+                modules_to_not_convert,
+                False,
+                False,
+                is_sym,
+            )
+        # otherwise for gptq
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False)
+        is_sym = cls.get_from_keys_or(config, ["sym"], default=True)
+        return cls(
+            method, weight_bits, group_size, [], desc_act, lm_head_quantized, is_sym
+        )
+    @classmethod
+    def override_quantization_method(
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
+        """Claim AWQ/GPTQ checkpoints when running on CPU or XPU.
+        Returns this config's name if the platform is CPU or XPU and the
+        checkpoint's ``quant_method`` is awq or gptq; otherwise ``None``.
+        ``user_quant`` is not consulted.
+        """
+        if not current_platform.is_cpu() and not current_platform.is_xpu():
+            return None
+        quant_method = hf_quant_cfg.get("quant_method", "").lower()
+        if quant_method in ["awq", "gptq"]:
+            return cls.get_name()
+        return None
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["LinearMethodBase"]:
+        """Pick the linear method for ``layer``, or ``None`` if not linear.
+        For AWQ, layers matched by ``modules_to_not_convert`` stay
+        unquantized; the GPTQ branch applies no such skip list.
+        """
+        if isinstance(layer, LinearBase):
+            if self.method == "awq":
+                if is_layer_skipped(
+                    prefix, self.modules_to_not_convert, self.packed_modules_mapping
+                ):
+                    return UnquantizedLinearMethod()
+                return IPEXAWQLinearMethod(self)
+            if self.method == "gptq":
+                return IPEXGPTQLinearMethod(self)
+        return None
+class IPEXGPTQLinearMethod(GPTQLinearMethod):
+    """GPTQ linear method using IPEX for the CPU/XPU backend."""
+    def __init__(self, quant_config: IPEXConfig):
+        # Only the config is stored; the parent initializer is not invoked.
+        self.quant_config = quant_config  # type: ignore
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Wrap the loaded GPTQ tensors in an IPEX weight-only linear op.
+        Sets ``layer.ipex_output_size`` and ``layer.ipex_qlinear``.
+        :raises ImportError: if intel_extension_for_pytorch is missing or
+            older than ``MIN_IPEX_VERSION``
+        """
+        # The bias is folded into the IPEX op unless the layer defers it.
+        bias = layer.bias if not layer.skip_bias_add else None
+        try:
+            import intel_extension_for_pytorch as ipex
+            if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION):
+                raise ImportError(
+                    "intel_extension_for_pytorch version is "
+                    "wrong. Please install "
+                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}."
+                )
+        except ImportError as err:
+            # Both a missing package and a too-old version end up here and
+            # are re-raised with the install hint, chained to the cause.
+            raise ImportError(
+                "Please install "
+                f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via "
+                f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`"
+                " to use IPEX-GPTQ linear method."
+            ) from err
+        # Using the compute dtype (lowp_mode) as INT8 to leverage instructions
+        # with better performance.
+        lowp_mode = ipex.quantization.WoqLowpMode.INT8
+        # The weight will be de-packed from INT4 to INT8.
+        weight_dtype = ipex.quantization.WoqWeightDtype.INT4
+        # The float activation will be quantized (dynamic, per-token) to INT8.
+        act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK
+        assert isinstance(self.quant_config, IPEXConfig)
+        qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
+            weight_dtype=weight_dtype,
+            lowp_mode=lowp_mode,
+            act_quant_mode=act_quant_mode,
+            group_size=self.quant_config.group_size,
+        )
+        # Output width is taken directly from qweight's last dimension.
+        layer.ipex_output_size = layer.qweight.shape[-1]
+        # g_idx is only forwarded when activation reordering is enabled.
+        g_idx = layer.g_idx if self.quant_config.desc_act else None
+        layer.ipex_qlinear = (
+            ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight(
+                layer.qweight,
+                layer.scales,
+                layer.qzeros,
+                layer.qweight.size(0),
+                layer.ipex_output_size,
+                qconfig=qconfig,
+                g_idx=g_idx,
+                bias=bias,
+                group_size=self.quant_config.group_size,
+                quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"],
+                weight_qscheme="sym" if self.quant_config.is_sym else "asym",
+            )
+        )
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Run the IPEX quantized linear op on ``x``.
+        ``x`` is flattened to 2-D for the op and the result is reshaped to
+        ``x.shape[:-1] + (layer.ipex_output_size,)``. The ``bias`` argument
+        is not used here; any bias was passed to IPEX at weight-processing
+        time.
+        """
+        reshaped_x = x.reshape(-1, x.shape[-1])
+        out = layer.ipex_qlinear(reshaped_x)
+        return out.reshape(x.shape[:-1] + (layer.ipex_output_size,))
+class IPEXAWQLinearMethod(AWQLinearMethod):
+    """AWQ linear method backed by IPEX on the CPU/XPU backend."""
+    def __init__(self, quant_config: IPEXConfig):
+        # Only the config is stored; the parent initializer is not invoked.
+        self.quant_config = quant_config  # type: ignore
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Build the IPEX weight-only linear op from the loaded AWQ tensors.
+        Runs the parent post-processing first, then sets
+        ``layer.ipex_output_size`` and ``layer.ipex_qlinear``.
+        :raises ImportError: if intel_extension_for_pytorch is missing or
+            older than ``MIN_IPEX_VERSION``
+        """
+        super().process_weights_after_loading(layer=layer)
+        # The bias is folded into the IPEX op unless the layer defers it.
+        fused_bias = None if layer.skip_bias_add else layer.bias
+        try:
+            import intel_extension_for_pytorch as ipex
+            installed = version.parse(ipex.__version__)
+            if installed < version.parse(MIN_IPEX_VERSION):
+                raise ImportError(
+                    "intel_extension_for_pytorch version is "
+                    "wrong. Please install "
+                    f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}."
+                )
+        except ImportError as err:
+            # Missing package and too-old version share the same install hint.
+            raise ImportError(
+                "Please install "
+                f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via "
+                f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`"
+                " to use IPEX-AWQ linear method."
+            ) from err
+        # INT8 is used as the compute dtype (lowp_mode) to leverage
+        # instructions with better performance.
+        compute_mode = ipex.quantization.WoqLowpMode.INT8
+        # Weights are de-packed from INT4 to INT8.
+        packed_dtype = ipex.quantization.WoqWeightDtype.INT4
+        # Float activations are quantized (dynamic, per-token) to INT8.
+        activation_mode = ipex.quantization.WoqActQuantMode.PER_BATCH
+        assert isinstance(self.quant_config, IPEXConfig)
+        cfg = self.quant_config
+        qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
+            weight_dtype=packed_dtype,
+            lowp_mode=compute_mode,
+            act_quant_mode=activation_mode,
+            group_size=cfg.group_size,
+        )
+        # qweight's second dim is packed, so scale it by the pack factor.
+        layer.ipex_output_size = layer.qweight.size(1) * cfg.pack_factor
+        woq_linear_cls = (
+            ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear
+        )
+        layer.ipex_qlinear = woq_linear_cls.from_weight(
+            layer.qweight,
+            layer.scales,
+            layer.qzeros,
+            layer.qweight.size(0),
+            layer.ipex_output_size,
+            qconfig=qconfig,
+            bias=fused_bias,
+            group_size=cfg.group_size,
+            quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"],  # type: ignore
+            weight_qscheme="sym" if cfg.is_sym else "asym",
+        )
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Run the IPEX quantized linear op on ``x``.
+        The input is flattened to 2-D for the op and the output regains the
+        leading dimensions of ``x``. The ``bias`` argument is not used here.
+        """
+        flat_input = x.reshape(-1, x.shape[-1])
+        flat_output = layer.ipex_qlinear(flat_input)
+        target_shape = x.shape[:-1] + (layer.ipex_output_size,)
+        return flat_output.reshape(target_shape)
+class XPUFp8LinearMethod(Fp8LinearMethod):
+    """FP8 linear method that calls the IPEX ``fp8_gemm_w8a16`` op."""
+    def __init__(self, quant_config: Fp8Config):
+        super().__init__(quant_config)
+    def process_weights_after_loading(self, layer: Module) -> None:
+        """Quantize the weight to FP8 unless the checkpoint already is FP8.
+        When quantizing, replaces ``layer.weight``, sets
+        ``layer.weight_scale`` and clears ``layer.input_scale``.
+        """
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # Checkpoint is already serialized as FP8: nothing to do here.
+            return
+        fp8_weight, fp8_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
+        # Swap the freshly quantized tensors into the layer.
+        layer.weight = Parameter(fp8_weight, requires_grad=False)
+        layer.weight_scale = Parameter(fp8_scale, requires_grad=False)
+        layer.input_scale = None
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Return the result of the IPEX FP8 GEMM for ``x`` and the layer."""
+        fp8_weight = layer.weight.data
+        fp8_scale = layer.weight_scale.data
+        return torch.ops.torch_ipex.fp8_gemm_w8a16(
+            x, fp8_weight, True, fp8_scale, bias
+        )
+class XPUFp8MoEMethod(FusedMoEMethodBase):
+    """FP8 fused-MoE method that runs through IPEX's ``GatedMLPMOE``."""
+    def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
+        # The base class is initialized from the layer's MoE config.
+        super().__init__(layer.moe_config)
+        self.quant_config = quant_config
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Allocate and register the expert weights and weight scales.
+        Registers ``w13_weight``, ``w2_weight``, ``w13_weight_scale`` and
+        ``w2_weight_scale`` on ``layer`` and sets both input scales to None.
+        """
+        layer.intermediate_size_per_partition = intermediate_size_per_partition
+        layer.hidden_size = hidden_size
+        layer.num_experts = num_experts
+        layer.orig_dtype = params_dtype
+        layer.weight_block_size = None
+        # WEIGHTS
+        # w13 holds two projections stacked along dim 1 (hence the factor 2).
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+        # Allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        # NOTE(review): the updated extra_weight_attrs is not applied to any
+        # parameter after this point in this method - confirm whether the
+        # scale parameters were meant to receive it via set_weight_attrs.
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+        # INPUT_SCALES
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+    def process_weights_after_loading(self, layer: Module) -> None:
+        """Quantize expert weights if needed, then build the IPEX fused op.
+        When the checkpoint is not FP8-serialized, each local expert's w13
+        and w2 weights are quantized separately. In all cases
+        ``layer.ipex_fusion`` is set to an IPEX ``GatedMLPMOE`` module.
+        """
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            fp8_dtype = current_platform.fp8_dtype()
+            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
+            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
+            # Re-initialize w13_scale because we directly quantize
+            # merged w13 weights and generate a single scaling factor.
+            layer.w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    layer.local_num_experts,
+                    dtype=torch.float32,
+                    device=w13_weight.device,
+                ),
+                requires_grad=False,
+            )
+            # Quantize one expert at a time, writing the FP8 weights and the
+            # matching scale in place.
+            for expert in range(layer.local_num_experts):
+                w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
+                    ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
+                )
+                w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
+                    ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
+                )
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+        # Imported lazily, inside the method rather than at module import.
+        import intel_extension_for_pytorch as ipex
+        # First expert id owned by this rank.
+        # NOTE(review): self.moe is presumably set by the base-class
+        # initializer from layer.moe_config - confirm.
+        ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
+        layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+            layer.w13_weight,
+            layer.w2_weight,
+            w1_scale_inv=layer.w13_weight_scale,
+            w2_scale_inv=layer.w2_weight_scale,
+            a1_scale_inv=layer.w13_input_scale,
+            a2_scale_inv=layer.w2_input_scale,
+            use_prepack=True,
+            experts_start_id=ep_rank_start,
+        )
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        """Return ``None``; this method supplies no fused-MoE quant config."""
+        return None
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: torch.Tensor | None = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Run the fused MoE through ``layer.ipex_fusion``.
+        Only ``x``, ``use_grouped_topk``, ``top_k``, ``router_logits``,
+        ``renormalize``, ``topk_group``, ``num_expert_group`` and
+        ``custom_routing_function`` are forwarded; the remaining parameters
+        are accepted but not used by this implementation.
+        """
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            custom_routing_function=custom_routing_function,
+        )

vllm/model_executor/layers/quantization/kernels/__init__.py ADDED Viewed

File without changes