PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py ADDED Viewed

@@ -0,0 +1,406 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.forward_context import get_forward_context, is_forward_context_available
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceDelegate,
+)
+from vllm.model_executor.layers.fused_moe.utils import _resize_cache
+from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import (
+    DeepGemmQuantScaleFMT,
+    fp8_m_grouped_gemm_nt_masked,
+    get_mk_alignment_for_contiguous_layout,
+    is_deep_gemm_e8m0_used,
+)
+from vllm.utils.math_utils import cdiv, round_up
+logger = init_logger(__name__)
+def scales_shape_stride_dtype(
+    E: int, T: int, G: int, quant_scale_fmt: DeepGemmQuantScaleFMT
+) -> tuple[tuple[int, ...], tuple[int, ...], torch.dtype]:
+    shape = (E, T, G)
+    strides = (T * G, 1, T)
+    if quant_scale_fmt in [
+        DeepGemmQuantScaleFMT.FLOAT32,
+        DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
+    ]:
+        return shape, strides, torch.float32
+    assert quant_scale_fmt == DeepGemmQuantScaleFMT.UE8M0
+    shape = (E, T, cdiv(G, 4))
+    strides = (T * cdiv(G, 4), 1, T)
+    return shape, strides, torch.int32
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+    # Pointers ------------------------------------------------------------
+    input_ptr,  # 16-bit activations (E, T, 2*H)
+    y_q_ptr,  # fp8 quantized activations (E, T, H)
+    y_s_ptr,  # 16-bit scales (E, T, G)
+    counts_ptr,  # int32 num tokens per expert (E)
+    # Sizes ---------------------------------------------------------------
+    H: tl.constexpr,  # hidden dimension (per output)
+    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
+    # Strides for input (elements) ---------------------------------------
+    stride_i_e,
+    stride_i_t,
+    stride_i_h,
+    # Strides for y_q (elements) -----------------------------------------
+    stride_yq_e,
+    stride_yq_t,
+    stride_yq_h,
+    # Strides for y_s (elements) -----------------------------------------
+    stride_ys_e,
+    stride_ys_t,
+    stride_ys_g,
+    # Stride for counts (elements)
+    stride_counts_e,
+    # Numeric params ------------------------------------------------------
+    eps: tl.constexpr,
+    fp8_min: tl.constexpr,
+    fp8_max: tl.constexpr,
+    ceil_ue8m0: tl.constexpr,
+    # Meta ---------------------------------------------------------------
+    BLOCK: tl.constexpr,
+    NUM_STAGES: tl.constexpr,
+):
+    G = H // GROUP_SIZE
+    # map program id -> (e, g)
+    pid = tl.program_id(0)
+    e = pid // G
+    g = pid % G
+    e = e.to(tl.int64)
+    g = g.to(tl.int64)
+    # number of valid tokens for this expert
+    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+    cols = tl.arange(0, BLOCK).to(tl.int64)
+    mask = cols < BLOCK
+    base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+    base_gate_offset = base_input_offset + cols * stride_i_h
+    base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+    base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+    base_ys_offset = e * stride_ys_e + g * stride_ys_g
+    for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+        gate = tl.load(
+            input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+        ).to(tl.float32)
+        up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+        gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+        y = gate * up
+        y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+        if ceil_ue8m0:
+            y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+        tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+        tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+def persistent_masked_m_silu_mul_quant(
+    y: torch.Tensor,  # (E, T, 2*H)
+    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
+    num_parallel_tokens=16,
+    group_size: int = 128,
+    quant_scale_fmt: DeepGemmQuantScaleFMT = DeepGemmQuantScaleFMT.FLOAT32,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+    y has shape (E, T, 2*H). The first half of the last dimension is
+    silu-activated, multiplied by the second half, then quantized into FP8.
+    We launch a fixed grid of threads to accommodate CUDA graphs. Let `P2`
+    be a parallelization factor for persistent_masked_m_silu_mul_quant over the
+    hidden dimension.
+    Let `expert_offsets = [0] + [num_tokens.cumsum()]` and
+    `total_tokens = expert_offsets[-1]`.
+    persistent_masked_m_silu_mul_quant launches `total_tokens x P2` number of
+    thread blocks. Each thread block contains `NUM_WARPS` warps.
+    Every thread block needs to find it's corresponding expert by warp-parallel scanning
+    over the `expert_offsets` array.
+    The i-th warp in the first thread block processes
+    `[i * warp_chunk_size, (i + 1) * warp_chunk_size]` groups
+    sequentially, where `warp_chunk_size = ((H / GROUP_SIZE) / P2) / NUM_WARPS`,
+    pipelining loads and computes.
+    The shared memory layout for 4 warps with a 2-stage pipeline for SiLU V2
+    can is visualized like so:
+                         stage0                              stage1
+    ┌─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┐
+    │gate0│up0│gate1│up1│gate2│up2│gate3│up3│gate0│up0│gate1│up1│gate2│up2│gate3│up3│
+    └─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┘
+    with the main difference between V1 and V2 being the global load
+    stride between warps, and between half-warps. Regarding the latter stride,
+    we assign the first half warp of every warp for `gate` loads and the second
+    half-warp to `up` loads.
+    Returns `(y_q, y_s)` where
+    * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+    * `y_s` depends on quant_scale_fmt,
+      - quant_scale_fmt == FLOAT32,
+         `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+      - quant_scale_fmt == E8M0,
+         `y_s`: Int32 tensor, shape (E, T, H // group_size // 4), strides (T*G, 1, T)
+      - quant_scale_fmt == E8M0_FLOAT32_SPARSE
+         `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+    Let NUM_WARPS be the number of warps in a single thread block and
+    `GROUP_SIZE = 128` be the size of the quantization group.
+    """
+    assert y.ndim == 3, "y must be (E, T, 2*H)"
+    E, T, H2 = y.shape
+    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+    H = H2 // 2
+    G = (H + group_size - 1) // group_size
+    assert H % 8 == 0, "H must be divisible by 8"
+    assert group_size == 128, "H must be divisible by 8"
+    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E
+    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+    fp8_dtype = torch.float8_e4m3fn
+    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+    ys_shape, ys_strides, ys_dtype = scales_shape_stride_dtype(E, T, G, quant_scale_fmt)
+    y_s = torch.empty_strided(
+        ys_shape,
+        ys_strides,
+        dtype=ys_dtype,
+        device=y.device,
+    )
+    ceil_ue8m0 = quant_scale_fmt in [
+        DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
+        DeepGemmQuantScaleFMT.UE8M0,
+    ]
+    cuda_arch = current_platform.get_device_capability(
+        device_id=y.device.index
+    ).to_int()
+    if cuda_arch >= 80:
+        torch.ops._C.persistent_masked_m_silu_mul_quant(
+            y, tokens_per_expert, y_q, y_s, ceil_ue8m0
+        )
+    else:
+        stride_cnt_e = tokens_per_expert.stride()[0]
+        # Static grid over experts and H-groups.
+        # A loop inside the kernel handles the token dim
+        grid = (E * G,)
+        # strides (elements)
+        stride_i_e, stride_i_t, stride_i_h = y.stride()
+        stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+        f_info = torch.finfo(fp8_dtype)
+        fp8_max = f_info.max
+        fp8_min = f_info.min
+        eps: float = 1e-10
+        assert y_s.dtype == torch.float32, (
+            "_silu_mul_fp8_quant_deep_gemm does"
+            "not support {y_s.dtype} scales. Only torch.float32 supported."
+        )
+        _silu_mul_fp8_quant_deep_gemm[grid](
+            y,
+            y_q,
+            y_s,
+            tokens_per_expert,
+            H,
+            group_size,
+            stride_i_e,
+            stride_i_t,
+            stride_i_h,
+            stride_yq_e,
+            stride_yq_t,
+            stride_yq_h,
+            ys_strides[0],
+            ys_strides[1],
+            ys_strides[2],
+            stride_cnt_e,
+            eps,
+            fp8_min,
+            fp8_max,
+            ceil_ue8m0,
+            BLOCK=group_size,
+            NUM_STAGES=4,
+            num_warps=1,
+        )
+    return y_q, y_s
+class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+    def __init__(
+        self,
+        max_num_tokens: int,
+        num_dispatchers: int,
+        quant_config: FusedMoEQuantConfig,
+    ):
+        """
+        max_num_tokens: Maximum number of tokens from a DP Rank
+        num_dispatchers: The number of DP dispatchers.
+        quant_config: Quantization configuration
+        """
+        super().__init__(quant_config)
+        assert self.block_shape == get_mk_alignment_for_contiguous_layout()
+        assert self.quant_config.use_fp8_w8a8
+        self.max_num_tokens = max_num_tokens
+        self.num_dispatchers = num_dispatchers
+    @property
+    def activation_formats(
+        self,
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (
+            mk.FusedMoEActivationFormat.BatchedExperts,
+            mk.FusedMoEActivationFormat.BatchedExperts,
+        )
+    def supports_chunking(self) -> bool:
+        return False
+    def supports_expert_map(self) -> bool:
+        return False
+    def supports_packed_ue8m0_act_scales(self) -> bool:
+        """
+        DeepGemm supports packed ue8m0 activation scales format in devices == sm100
+        """
+        return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100)
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        # Let PrepareAndFinalize::finalize() decide the impl.
+        return TopKWeightAndReduceDelegate()
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        # FIXME (varun): We should be able to dispatch only from the leader
+        # DP ranks in the case of TP > 1. At the moment, all the Ranks
+        # end up sending their tokens. This needs to be fixed.
+        num_dispatchers = self.num_dispatchers
+        num_experts = local_num_experts
+        max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
+        workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
+        workspace2 = (num_experts, max_num_tokens * num_dispatchers, (N // 2))
+        output = (num_experts, max_num_tokens * num_dispatchers, K)
+        return (workspace13, workspace2, output)
+    def estimate_expected_m(
+        self, global_num_experts: int, max_tokens_per_expert: int, topk: int
+    ) -> int:
+        dp_meta = (
+            get_forward_context().dp_metadata
+            if is_forward_context_available()
+            else None
+        )
+        if dp_meta is None:
+            logger.warning_once(
+                "DPMetadata unavailable. Defaulting expected_m to "
+                f"{max_tokens_per_expert}.",
+                scope="local",
+            )
+            return max_tokens_per_expert
+        total_num_tokens = dp_meta.num_tokens_across_dp_cpu.sum().item()
+        total_num_tokens_replicated = total_num_tokens * topk
+        # Assume even load balancing
+        assert global_num_experts != 0
+        estimate = round_up(int(total_num_tokens_replicated // global_num_experts), 16)
+        # clamp estimate
+        estimate = max(estimate, 16)
+        estimate = min(max_tokens_per_expert, estimate)
+        return estimate
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        assert expert_tokens_meta is not None
+        expert_num_tokens = expert_tokens_meta.expert_num_tokens
+        assert hidden_states.ndim == 3
+        assert self.block_shape is not None
+        a1q = hidden_states
+        _, N, K = w1.size()
+        assert w2.size(1) == K
+        E, max_num_tokens, N, K, _ = self.moe_problem_size(
+            hidden_states, w1, w2, topk_ids
+        )
+        workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))
+        expected_m = self.estimate_expected_m(
+            global_num_experts=global_num_experts,
+            max_tokens_per_expert=max_num_tokens,
+            topk=topk_ids.size(-1),
+        )
+        fp8_m_grouped_gemm_nt_masked(
+            (a1q, a1q_scale),
+            (w1, self.w1_scale),
+            workspace1,
+            expert_num_tokens,
+            expected_m,
+        )
+        quant_scale_fmt = DeepGemmQuantScaleFMT.from_oracle()
+        a2q, a2q_scale = persistent_masked_m_silu_mul_quant(
+            workspace1,
+            expert_num_tokens,
+            quant_scale_fmt=quant_scale_fmt,
+        )
+        fp8_m_grouped_gemm_nt_masked(
+            (a2q, a2q_scale),
+            (w2, self.w2_scale),
+            output,
+            expert_num_tokens,
+            expected_m,
+        )

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py ADDED Viewed

@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+    BatchedDeepGemmExperts,
+)
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
+from vllm.utils.deep_gemm import get_mk_alignment_for_contiguous_layout
+class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+    def __init__(
+        self,
+        max_num_tokens: int,
+        num_dispatchers: int,
+        quant_config: FusedMoEQuantConfig,
+        allow_deep_gemm: bool = False,
+    ):
+        super().__init__(quant_config)
+        self.batched_triton_experts = BatchedTritonExperts(
+            max_num_tokens=max_num_tokens,
+            num_dispatchers=num_dispatchers,
+            quant_config=self.quant_config,
+        )
+        self.allow_deep_gemm = (
+            allow_deep_gemm
+            and self.quant_config.use_fp8_w8a8
+            and self.block_shape == get_mk_alignment_for_contiguous_layout()
+        )
+        self.batched_deep_gemm_experts = (
+            BatchedDeepGemmExperts(
+                max_num_tokens=max_num_tokens,
+                num_dispatchers=num_dispatchers,
+                quant_config=self.quant_config,
+            )
+            if self.allow_deep_gemm
+            else None
+        )
+        assert (
+            self.batched_deep_gemm_experts is not None
+            or self.batched_triton_experts is not None
+        )
+    @property
+    def activation_formats(
+        self,
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        if self.batched_triton_experts is not None:
+            assert (
+                self.batched_deep_gemm_experts is None
+                or self.batched_deep_gemm_experts.activation_formats
+                == self.batched_triton_experts.activation_formats
+            )
+            return self.batched_triton_experts.activation_formats
+        else:
+            assert self.batched_deep_gemm_experts is not None
+            return self.batched_deep_gemm_experts.activation_formats
+    def supports_chunking(self) -> bool:
+        bdge = self.batched_deep_gemm_experts
+        bte = self.batched_triton_experts
+        return (bdge is None or bdge.supports_chunking()) and (
+            bte is None or bte.supports_chunking()
+        )
+    def supports_expert_map(self) -> bool:
+        bdge = self.batched_deep_gemm_experts
+        bte = self.batched_triton_experts
+        return (bdge is None or bdge.supports_expert_map()) and (
+            bte is None or bte.supports_expert_map()
+        )
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        bdge = self.batched_deep_gemm_experts
+        bte = self.batched_triton_experts
+        bdge_war = bdge.finalize_weight_and_reduce_impl() if bdge else None
+        bte_war = bte.finalize_weight_and_reduce_impl() if bte else None
+        is_bdge_war = bdge_war is not None
+        is_bte_war = bte_war is not None
+        if is_bdge_war and is_bte_war:
+            assert bdge_war == bte_war, (
+                "Both implementations should agree on WeightAndReduce impls. "
+                f"Got bdge_war: {bdge_war}, and bte_war: {bte_war}"
+            )
+        if bdge_war is not None:
+            return bdge_war
+        assert bte_war is not None
+        return bte_war
+    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
+        return act_dtype
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_metadata: mk.ExpertTokensMetadata | None,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        # Note: the deep gemm workspaces are strictly larger than the triton
+        # workspaces so we can be pessimistic here and allocate for DeepGemm
+        # even if we fall back to triton later, e.g. if expert maps are set.
+        if self.allow_deep_gemm:
+            assert self.batched_deep_gemm_experts is not None
+            return self.batched_deep_gemm_experts.workspace_shapes(
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_metadata,
+            )
+        else:
+            assert self.batched_triton_experts is not None
+            return self.batched_triton_experts.workspace_shapes(
+                M,
+                N,
+                K,
+                topk,
+                global_num_experts,
+                local_num_experts,
+                expert_tokens_metadata,
+            )
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        experts = (
+            self.batched_deep_gemm_experts
+            if self.allow_deep_gemm
+            else self.batched_triton_experts
+        )
+        assert experts is not None
+        experts.apply(
+            output,
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            activation,
+            global_num_experts,
+            expert_map,
+            a1q_scale,
+            a2_scale,
+            workspace13,
+            workspace2,
+            expert_tokens_meta,
+            apply_router_weight_on_input,
+        )