PyPI - vllm-cpu-avx512bf16 - Versions diffs - 0.9.0.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-avx512bf16 0.9.0.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1175) hide show

vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py ADDED Viewed

@@ -0,0 +1,372 @@
+# SPDX-License-Identifier: Apache-2.0
+from enum import IntEnum
+from functools import cache
+from typing import Optional
+import torch
+from vllm import envs
+from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
+class QuantMethod(IntEnum):
+    """Integer codes naming the quantization scheme handed to AITER kernels.
+    In this module the codes are converted with ``QuantType(quant_method)``
+    inside ``rocm_aiter_fused_moe_impl``.
+    """
+    # This allows interfacing with AITER QuantType Enum
+    # without importing the QuantType from AITER globally.
+    # Note that these quantization methods are
+    # supported in AITER package. However,
+    # not all are used in this module.
+    NO = 0  # a16w16
+    PER_TENSOR = 1  # w8a8 (per_Tensor)
+    PER_TOKEN = 2  # w8a8/w8a4 (per_Token)
+    BLOCK_1X128 = 3  # block quantized w8a8 (per_1x128)
+    BLOCK_128x128 = 4  # block quantized w8a8 (per_128x128)
+class ActivationMethod(IntEnum):
+    """Integer codes naming the activation function handed to AITER kernels.
+    In this module the codes are converted with
+    ``ActivationType(activation_method)`` inside the ``*_impl`` functions.
+    """
+    # This allows interfacing with AITER ActivationType enum
+    # without importing the ActivationType enum from AITER globally.
+    SILU = 0
+    GELU = 1
+@cache
+def is_rocm_aiter_moe_enabled() -> bool:
+    """Tell whether the AITER MoE path should be used.
+    True only when the current platform is ROCm and both the
+    ``VLLM_ROCM_USE_AITER_MOE`` and ``VLLM_ROCM_USE_AITER`` settings are
+    truthy. The result is memoized by ``functools.cache``, so the settings
+    are read at most once per process.
+    """
+    return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER_MOE
+            and envs.VLLM_ROCM_USE_AITER)
+def rocm_aiter_asm_moe_tkw1_impl(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        fc1_scale: Optional[torch.Tensor] = None,
+        fc2_scale: Optional[torch.Tensor] = None,
+        fc1_smooth_scale: Optional[torch.Tensor] = None,
+        fc2_smooth_scale: Optional[torch.Tensor] = None,
+        a16: bool = False,
+        per_tensor_quant_scale: Optional[torch.Tensor] = None,
+        expert_mask: Optional[torch.Tensor] = None,
+        activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor:
+    """Forward the arguments to AITER's ``asm_moe_tkw1`` kernel.
+    AITER is imported at call time, so this module can be imported without
+    it. ``activation_method`` is an integer code (see ``ActivationMethod``)
+    that is converted to AITER's ``ActivationType`` before the call.
+    Returns whatever ``asm_moe_tkw1`` returns.
+    """
+    from aiter import ActivationType
+    from aiter.fused_moe_bf16_asm import asm_moe_tkw1
+    # Convert the plain int back into the enum AITER expects.
+    aiter_activation = ActivationType(activation_method)
+    return asm_moe_tkw1(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        fc1_scale=fc1_scale,
+        fc2_scale=fc2_scale,
+        fc1_smooth_scale=fc1_smooth_scale,
+        fc2_smooth_scale=fc2_smooth_scale,
+        a16=a16,
+        per_tensor_quant_scale=per_tensor_quant_scale,
+        expert_mask=expert_mask,
+        activation=aiter_activation,
+    )
+def rocm_aiter_asm_moe_tkw1_fake(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        fc1_scale: Optional[torch.Tensor] = None,
+        fc2_scale: Optional[torch.Tensor] = None,
+        fc1_smooth_scale: Optional[torch.Tensor] = None,
+        fc2_smooth_scale: Optional[torch.Tensor] = None,
+        a16: bool = False,
+        per_tensor_quant_scale: Optional[torch.Tensor] = None,
+        expert_mask: Optional[torch.Tensor] = None,
+        activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor:
+    """Fake counterpart of ``rocm_aiter_asm_moe_tkw1_impl``.
+    Runs no kernel: it only allocates an uninitialized tensor shaped like
+    ``hidden_states``. It is registered as the op's ``fake_impl``
+    (presumably for tracing / shape propagation -- confirm against
+    ``direct_register_custom_op``).
+    """
+    placeholder = torch.empty_like(hidden_states)
+    return placeholder
+def rocm_aiter_topk_softmax_impl(topk_weights: torch.Tensor,
+                                 topk_indices: torch.Tensor,
+                                 token_expert_indices: torch.Tensor,
+                                 gating_output: torch.Tensor,
+                                 renormalize: bool) -> None:
+    """Forward the arguments to AITER's ``topk_softmax`` and return nothing.
+    AITER is imported at call time. The op registration in this module
+    declares ``topk_weights``, ``topk_indices`` and ``token_expert_indices``
+    as mutated arguments, i.e. results are delivered through them.
+    """
+    from aiter import topk_softmax as aiter_topk_softmax
+    aiter_topk_softmax(
+        topk_weights,
+        topk_indices,
+        token_expert_indices,
+        gating_output,
+        renormalize,
+    )
+def rocm_aiter_topk_softmax_fake(topk_weights: torch.Tensor,
+                                 topk_indices: torch.Tensor,
+                                 token_expert_indices: torch.Tensor,
+                                 gating_output: torch.Tensor,
+                                 renormalize: bool) -> None:
+    """Fake counterpart of ``rocm_aiter_topk_softmax_impl``: does nothing."""
+    return None
+def rocm_aiter_biased_grouped_topk_impl(
+        gating_output: torch.Tensor,
+        correction_bias: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        routed_scaling_factor: float = 1.0  # mul to topk_weights
+) -> None:
+    """Forward the arguments to AITER's ``biased_grouped_topk``.
+    AITER is imported at call time. Nothing is returned; the op registration
+    in this module declares ``topk_weights`` and ``topk_ids`` as mutated
+    arguments, i.e. results are delivered through them.
+    """
+    from aiter import biased_grouped_topk as aiter_biased_grouped_topk
+    aiter_biased_grouped_topk(
+        gating_output,
+        correction_bias,
+        topk_weights,
+        topk_ids,
+        num_expert_group,
+        topk_group,
+        need_renorm,
+        routed_scaling_factor,
+    )
+def rocm_aiter_biased_grouped_topk_fake(
+        gating_output: torch.Tensor,
+        correction_bias: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_expert_group: int,
+        topk_group: int,
+        need_renorm: bool,
+        routed_scaling_factor: float = 1.0  # mul to topk_weights
+) -> None:
+    """Fake counterpart of ``rocm_aiter_biased_grouped_topk_impl``: no-op."""
+    return None
+def rocm_aiter_fused_moe_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: Optional[torch.Tensor] = None,
+    activation_method: int = ActivationMethod.SILU.value,
+    quant_method: int = QuantMethod.NO.value,
+    doweight_stage1: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Forward the arguments to AITER's ``fused_moe`` kernel.
+    AITER is imported at call time. The integer codes ``activation_method``
+    and ``quant_method`` (see ``ActivationMethod`` / ``QuantMethod``) are
+    converted to AITER's ``ActivationType`` and ``QuantType`` enums, and all
+    arguments are then passed positionally. Returns whatever ``fused_moe``
+    returns.
+    """
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+    aiter_activation = ActivationType(activation_method)
+    aiter_quant = QuantType(quant_method)
+    return fused_moe(
+        hidden_states,
+        w1,
+        w2,
+        topk_weight,
+        topk_ids,
+        expert_mask,
+        aiter_activation,
+        aiter_quant,
+        doweight_stage1,
+        w1_scale,
+        w2_scale,
+        a1_scale,
+        a2_scale,
+    )
+def rocm_aiter_fused_moe_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: Optional[torch.Tensor] = None,
+    activation_method: int = ActivationMethod.SILU.value,
+    quant_method: int = QuantMethod.NO.value,
+    doweight_stage1: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Fake counterpart of ``rocm_aiter_fused_moe_impl``.
+    Runs no kernel: it only allocates an uninitialized tensor shaped like
+    ``hidden_states``.
+    """
+    placeholder = torch.empty_like(hidden_states)
+    return placeholder
+if current_platform.is_rocm():
+    # Register each AITER wrapper as a custom op, on ROCm only. The wrappers
+    # further down in this module invoke them as `torch.ops.vllm.<op_name>`.
+    # Every op pairs the real implementation with its `*_fake` counterpart.
+    # The two MoE kernels declare no mutated arguments.
+    direct_register_custom_op(
+        op_name="rocm_aiter_asm_moe_tkw1",
+        op_func=rocm_aiter_asm_moe_tkw1_impl,
+        mutates_args=[],
+        fake_impl=rocm_aiter_asm_moe_tkw1_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+    direct_register_custom_op(
+        op_name="rocm_aiter_fused_moe",
+        op_func=rocm_aiter_fused_moe_impl,
+        mutates_args=[],
+        fake_impl=rocm_aiter_fused_moe_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+    # The two top-k ops return None; the tensors named in `mutates_args`
+    # are the ones declared as written in place.
+    direct_register_custom_op(
+        op_name="rocm_aiter_topk_softmax",
+        op_func=rocm_aiter_topk_softmax_impl,
+        mutates_args=["topk_weights", "topk_indices", "token_expert_indices"],
+        fake_impl=rocm_aiter_topk_softmax_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+    direct_register_custom_op(
+        op_name="rocm_aiter_biased_grouped_topk",
+        op_func=rocm_aiter_biased_grouped_topk_impl,
+        mutates_args=["topk_weights", "topk_ids"],
+        fake_impl=rocm_aiter_biased_grouped_topk_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+def rocm_aiter_biased_group_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+    scoring_func: str = "sigmoid",
+    e_score_correction_bias: Optional[torch.Tensor] = None
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Select top-k experts per token via the AITER biased grouped top-k op.
+    Allocates the two output buffers, one row per row of ``hidden_states``
+    and ``topk`` columns, on the device of ``hidden_states``, then lets the
+    ``rocm_aiter_biased_grouped_topk`` custom op fill them.
+    Args:
+        hidden_states: only its first dimension and device are read here.
+        gating_output: passed through to the op.
+        topk: number of columns in the output buffers.
+        renormalize: passed to the op as its ``need_renorm`` argument.
+        num_expert_group: passed through to the op.
+        topk_group: passed through to the op.
+        scoring_func: must be "sigmoid".
+        e_score_correction_bias: passed to the op as the correction bias;
+            must not be None.
+    Returns:
+        ``(topk_weights, topk_ids)`` with dtypes float32 and int32.
+    Raises:
+        AssertionError: if ``scoring_func`` is not "sigmoid" or the bias is
+            None.
+    """
+    assert scoring_func == "sigmoid", (
+        "rocm_aiter_biased_group_topk only supports 'sigmoid' scoring_func.")
+    assert e_score_correction_bias is not None, (
+        "'e_score_correction_bias' must not be None.")
+    out_shape = (hidden_states.shape[0], topk)
+    out_device = hidden_states.device
+    topk_ids = torch.empty(out_shape, dtype=torch.int32, device=out_device)
+    topk_weights = torch.empty(out_shape,
+                               dtype=torch.float32,
+                               device=out_device)
+    # The op writes into both buffers; routed_scaling_factor keeps its
+    # default because it is not passed here.
+    grouped_topk_op = torch.ops.vllm.rocm_aiter_biased_grouped_topk
+    grouped_topk_op(
+        gating_output,
+        e_score_correction_bias,
+        topk_weights,
+        topk_ids,
+        num_expert_group,
+        topk_group,
+        renormalize,
+    )
+    return topk_weights, topk_ids
+def rocm_aiter_fused_experts(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        use_fp8_w8a8: bool = False,
+        per_channel_quant: bool = False,
+        w1_scale: Optional[torch.Tensor] = None,
+        w2_scale: Optional[torch.Tensor] = None,
+        a1_scale: Optional[torch.Tensor] = None,
+        a2_scale: Optional[torch.Tensor] = None,
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:
+    """Run the MoE experts through one of the registered AITER custom ops.
+    Dispatch:
+      * ``per_channel_quant and apply_router_weight_on_input and
+        use_fp8_w8a8`` -> ``torch.ops.vllm.rocm_aiter_asm_moe_tkw1``.
+      * otherwise -> ``torch.ops.vllm.rocm_aiter_fused_moe`` with a quant
+        method of BLOCK_128x128 (fp8 with ``block_shape``), PER_TENSOR
+        (fp8 without ``block_shape``) or NO (everything else).
+    Args:
+        hidden_states, w1, w2: passed through to the selected op.
+        topk_weights: cast to float32 before use.
+        topk_ids: cast to int32 before use.
+        activation: "silu" selects SILU; any other value selects GELU.
+        apply_router_weight_on_input: requires ``topk_weights`` to be 2-D
+            with a last dimension of 1; not allowed for block-scaled fp8.
+        use_fp8_w8a8, per_channel_quant, block_shape: select the kernel and
+            quant method as described above.
+        w1_scale, w2_scale: weight scales; required (non-None) for
+            block-scaled fp8.
+        a1_scale, a2_scale: only forwarded on the ``rocm_aiter_fused_moe``
+            path.
+    Returns:
+        The tensor returned by the selected custom op.
+    Raises:
+        AssertionError: if one of the preconditions above is violated.
+    """
+    # NOTE(review): every activation other than "silu" is mapped to GELU
+    # without validation -- confirm this is intended by callers.
+    activation_method = (ActivationMethod.SILU
+                         if activation == "silu" else ActivationMethod.GELU)
+    # All AITER Fused MoE kernels are expecting the following datatypes
+    topk_weights = topk_weights.to(torch.float32)
+    topk_ids = topk_ids.to(torch.int32)
+    # w8a8 per-channel quantization
+    if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8:
+        # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
+        # This applies topk_weights on the GEMM output of the first FC layer
+        #  rather than the second FC.
+        assert (topk_weights.dim() == 2
+                ), "`topk_weights` should be in shape (num_tokens, topk)"
+        assert topk_weights.shape[-1] == 1, (
+            "Only support topk=1 when"
+            " `apply_router_weight_on_input` is True")
+        return torch.ops.vllm.rocm_aiter_asm_moe_tkw1(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            fc1_scale=w1_scale,
+            fc2_scale=w2_scale,
+            fc1_smooth_scale=None,
+            fc2_smooth_scale=None,
+            a16=False,
+            per_tensor_quant_scale=None,
+            expert_mask=None,
+            activation_method=activation_method)
+    else:
+        quant_method = QuantMethod.NO.value
+        # w8a8 block-scaled
+        if block_shape is not None and use_fp8_w8a8:
+            # Adjacent literals instead of a backslash continuation inside
+            # the string, which embedded the next line's indentation in the
+            # message.
+            assert not apply_router_weight_on_input, (
+                "apply_router_weight_on_input is "
+                "not supported for block scaled moe")
+            assert w1_scale is not None
+            assert w2_scale is not None
+            quant_method = QuantMethod.BLOCK_128x128.value
+        elif use_fp8_w8a8:
+            # Currently only per tensor quantization method is enabled.
+            quant_method = QuantMethod.PER_TENSOR.value
+        if apply_router_weight_on_input:
+            assert (topk_weights.dim() == 2
+                    ), "`topk_weights` should be in shape (num_tokens, topk)"
+            _, topk = topk_weights.shape
+            assert (
+                topk == 1
+            ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+        return torch.ops.vllm.rocm_aiter_fused_moe(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            quant_method=quant_method,
+            activation_method=activation_method,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            doweight_stage1=apply_router_weight_on_input)
+def rocm_aiter_topk_softmax(topk_weights: torch.Tensor,
+                            topk_indices: torch.Tensor,
+                            token_expert_indices: torch.Tensor,
+                            gating_output: torch.Tensor,
+                            renormalize: bool) -> tuple[torch.Tensor, ...]:
+    """Call the ``rocm_aiter_topk_softmax`` custom op and return its outputs.
+    The op itself returns nothing; the caller-provided ``topk_weights`` and
+    ``topk_indices`` tensors are handed back as a pair after the call.
+    """
+    topk_softmax_op = torch.ops.vllm.rocm_aiter_topk_softmax
+    topk_softmax_op(
+        topk_weights,
+        topk_indices,
+        token_expert_indices,
+        gating_output,
+        renormalize,
+    )
+    return topk_weights, topk_indices
+def shuffle_weights(
+    *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
+) -> tuple[torch.Tensor, ...]:
+    """
+    Run AITER's shuffle_weight over every input tensor.
+    Each tensor is passed, in order, to shuffle_weight together
+    with the requested block layout.
+    Args:
+        *tensors: Any number of torch.Tensor objects to shuffle.
+        layout: Pair of integers forwarded to shuffle_weight as
+        its `layout` keyword. Default is (16, 16).
+    Returns:
+    A tuple holding the shuffled tensors, in input order.
+    """
+    from aiter.ops.shuffle import shuffle_weight
+    shuffled = []
+    for weight in tensors:
+        shuffled.append(shuffle_weight(weight, layout=layout))
+    return tuple(shuffled)

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py ADDED Viewed

@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+    DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape)
+from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+    """Experts implementation that picks a DeepGemm or a Triton backend.
+    Both backends are built up front; each call decides which one to
+    delegate to.
+    """
+    def __init__(self,
+                 use_fp8_w8a8: bool = False,
+                 use_int8_w8a8: bool = False,
+                 use_int8_w8a16: bool = False,
+                 use_int4_w4a16: bool = False,
+                 per_channel_quant: bool = False,
+                 block_shape: Optional[list[int]] = None,
+                 block_m: Optional[int] = None,
+                 allow_deep_gemm: bool = False):
+        """Build both backends and remember the dispatch flags.
+        All quantization options go to the Triton backend; the DeepGemm
+        backend is constructed without arguments.
+        """
+        super().__init__()
+        self.allow_deep_gemm = allow_deep_gemm
+        self.use_fp8_w8a8 = use_fp8_w8a8
+        self.triton_expert = TritonExperts(
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int4_w4a16=use_int4_w4a16,
+            use_int8_w8a16=use_int8_w8a16,
+            per_channel_quant=per_channel_quant,
+            block_shape=block_shape,
+            block_m=block_m,
+        )
+        self.deep_gemm_expert = DeepGemmExperts()
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+    ) -> tuple[int, int, torch.dtype]:
+        """Return the workspace shapes reported by the selected backend."""
+        # Note: the deep gemm workspaces are strictly larger than the triton
+        # workspaces so we can be pessimistic here and allocate for DeepGemm
+        # even if we fall back to triton later, e.g. if expert maps are set.
+        size_for_deep_gemm = (self.allow_deep_gemm
+                              and _valid_deep_gemm_shape(M, N, K))
+        backend = (self.deep_gemm_expert
+                   if size_for_deep_gemm else self.triton_expert)
+        return backend.workspace_shapes(a, M, N, K, topk, num_experts)
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_num_tokens: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """Delegate to the DeepGemm backend when eligible, else to Triton.
+        DeepGemm is used only when it is allowed, fp8 w8a8 is enabled, the
+        size of ``w1`` along dimension 1 exceeds 512 and ``_valid_deep_gemm``
+        accepts the inputs. All arguments are forwarded unchanged.
+        """
+        w1_dim1 = w1.size(1)
+        use_deep_gemm = (self.allow_deep_gemm and self.use_fp8_w8a8
+                         and w1_dim1 > 512 and _valid_deep_gemm(
+                             hidden_states, w1, w2, expert_map))
+        backend = (self.deep_gemm_expert
+                   if use_deep_gemm else self.triton_expert)
+        return backend.apply(
+            hidden_states,
+            w1,
+            w2,
+            topk_ids,
+            activation,
+            global_num_experts,
+            expert_map,
+            w1_scale,
+            w2_scale,
+            w1_zp,
+            w2_zp,
+            a1q_scale,
+            a2_scale,
+            workspace13,
+            workspace2,
+            expert_num_tokens,
+        )

vllm/model_executor/layers/fused_moe/utils.py ADDED Viewed

@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+from math import prod
+from typing import Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8)
+from vllm.model_executor.layers.quantization.utils.int8_utils import (
+    per_token_group_quant_int8, per_token_quant_int8)
+from vllm.utils import cdiv
+def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
+    """
+    Return the leading prod(v) elements of the flattened tensor,
+    viewed with shape `v`.  Used to resize the intermediate
+    fused_moe caches.
+    """
+    wanted = prod(v)
+    available = x.numel()
+    # CUDAGRAPH unfriendly?
+    assert wanted <= available, f"{wanted} <= {available}"
+    return x.flatten()[:wanted].view(*v)
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    per_act_token: bool,
+    block_shape: Optional[list[int]] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize the input to fp8 and return (quantized, scale).
+    Without a block_shape the work is done by ops.scaled_fp8_quant,
+    with `per_act_token` selecting per-token scaling when dynamic.
+    With a block_shape the input is group-quantized along the last
+    dimension using block_shape[1] as the group size; the incoming
+    A_scale is not used in that case.
+    """
+    if block_shape is None:
+        quantized, scale = ops.scaled_fp8_quant(
+            A, A_scale, use_per_token_if_dynamic=per_act_token)
+        return quantized, scale
+    assert len(block_shape) == 2
+    block_k = block_shape[1]
+    quantized, scale = per_token_group_quant_fp8(A, block_k)
+    # One scale per group of block_k elements along the last dimension.
+    assert cdiv(quantized.size(-1), block_k) == scale.size(-1)
+    return quantized, scale
+def _int8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    per_act_token: bool,
+    block_shape: Optional[list[int]] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize the input to int8 and return (quantized, scale).
+    Without a block_shape only per-token quantization is accepted
+    (`per_act_token` must be True).  With a block_shape the input is
+    group-quantized along the last dimension using block_shape[1]
+    as the group size.  The incoming A_scale is never read.
+    """
+    # If weights are per-channel (per_channel_quant=True), then
+    # activations apply per-token quantization. Otherwise, assume
+    # activation tensor-wise fp8/int8 quantization, dynamic or static
+    if block_shape is None:
+        assert per_act_token, \
+            "int8 quantization only supports block or channel-wise"
+        quantized, scale = per_token_quant_int8(A)
+        return quantized, scale
+    assert len(block_shape) == 2
+    block_k = block_shape[1]
+    quantized, scale = per_token_group_quant_int8(A, block_k)
+    # One scale per group of block_k elements along the last dimension.
+    assert cdiv(quantized.size(-1), block_k) == scale.size(-1)
+    return quantized, scale
+def moe_kernel_quantize_input(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    qtype: Optional[torch.dtype],
+    per_channel_quant: bool,
+    block_shape: Optional[list[int]] = None,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    """
+    Quantize the input according to `qtype` and return (tensor, scale).
+    torch.float8_e4m3fn goes through _fp8_quantize and torch.int8
+    through _int8_quantize, with `per_channel_quant` passed as their
+    per-token flag.  Any other qtype returns the input untouched and
+    requires A_scale to be None.
+    """
+    if qtype == torch.float8_e4m3fn:
+        return _fp8_quantize(A, A_scale, per_channel_quant, block_shape)
+    if qtype == torch.int8:
+        return _int8_quantize(A, A_scale, per_channel_quant, block_shape)
+    # No quantization requested: a scale would have nothing to apply to.
+    assert A_scale is None
+    return A, A_scale
+def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+    """
+    Index `m` along its first dimension with `idx`, in a way that
+    also works on fp8 types: one-byte floating point tensors are
+    indexed through a uint8 view and viewed back as their own dtype.
+    """
+    one_byte_float = torch.is_floating_point(m) and m.dtype.itemsize == 1
+    if not one_byte_float:
+        return m[idx, ...]
+    as_bytes = m.view(dtype=torch.uint8)
+    return as_bytes[idx, ...].view(dtype=m.dtype)