PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py ADDED Viewed

@@ -0,0 +1,362 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.distributed import get_dp_group, get_ep_group
+from vllm.distributed.device_communicators.base_device_communicator import (
+    All2AllManagerBase,
+)
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceNoOP,
+)
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
+from vllm.utils.flashinfer import nvfp4_block_scale_interleave
+def get_local_sizes():
+    return get_forward_context().dp_metadata.get_chunk_sizes_across_dp_rank()
+class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
+    """Base class for FlashInfer MoE prepare and finalize operations."""
+    def __init__(
+        self,
+        use_dp: bool,
+        num_dispatchers: int = 1,
+        use_deepseek_fp8_block_scale: bool = False,
+    ):
+        super().__init__()
+        self.num_dispatchers_ = num_dispatchers
+        self.use_dp = use_dp
+        self.local_tokens = None
+        # Toggle for DeepSeek-style FP8 block-scale path where activations are
+        # not quantized here and weight block scales are consumed by the kernel.
+        self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
+    @property
+    def activation_format(self) -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.Standard
+    def max_num_tokens_per_rank(self) -> int | None:
+        return None
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return None
+    def num_dispatchers(self) -> int:
+        return self.num_dispatchers_
+    def output_is_reduced(self) -> bool:
+        return False
+    def _apply_router_weight_on_input(
+        self,
+        a1: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+    ) -> None:
+        """Apply router weight on input if needed."""
+        if apply_router_weight_on_input:
+            topk = topk_ids.size(1)
+            assert topk == 1, (
+                "apply_router_weight_on_input is only implemented for topk=1"
+            )
+            a1.mul_(topk_weights.to(a1.dtype))
+class FlashInferAllToAllMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
+    """FlashInfer implementation using AllToAll communication."""
+    def __init__(
+        self,
+        use_dp: bool,
+        num_dispatchers: int = 1,
+        use_deepseek_fp8_block_scale: bool = False,
+    ):
+        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
+        self.alltoall_info = None
+        # Initialize all2all_manager only for DP case
+        self.all2all_manager = None
+        if self.use_dp:
+            self.all2all_manager = get_ep_group().device_communicator.all2all_manager
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: torch.Tensor | None,
+        apply_router_weight_on_input: bool,
+        quant_config: FusedMoEQuantConfig,
+    ) -> mk.PrepareResultType:
+        self._apply_router_weight_on_input(
+            a1, topk_weights, topk_ids, apply_router_weight_on_input
+        )
+        if not self.use_dp:
+            # Non-DP case: quantize activations unless using block-scale path
+            if not self.use_deepseek_fp8_block_scale:
+                a1q, a1q_scale = moe_kernel_quantize_input(
+                    a1,
+                    quant_config.a1_gscale,
+                    quant_config.quant_dtype,
+                    quant_config.per_act_token_quant,
+                    quant_config.block_shape,
+                    is_fp4_scale_swizzled=not self.use_dp,
+                )
+            else:
+                a1q = a1
+                a1q_scale = None
+        else:
+            # DP case: use FlashInfer AllToAll
+            global_num_tokens_cpu = get_local_sizes()
+            top_k = topk_ids.size(1)
+            (self.alltoall_info, topk_ids, topk_weights, a1q, a1q_scale) = (
+                flashinfer_alltoall_dispatch(
+                    self.all2all_manager,
+                    global_num_tokens_cpu,
+                    a1,
+                    quant_config.a1_gscale,
+                    topk_ids,
+                    topk_weights,
+                    top_k,
+                    num_experts,
+                    quant_config,
+                    use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
+                )
+            )
+        return a1q, a1q_scale, None, topk_ids, topk_weights
+    def finalize(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> None:
+        if self.use_dp:
+            top_k = topk_ids.size(1)
+            token_count = output.shape[0]
+            fused_expert_output = flashinfer_alltoall_combine(
+                self.all2all_manager,
+                fused_expert_output,
+                top_k=top_k,
+                token_count=token_count,
+                alltoall_info=self.alltoall_info,
+            )
+        output.copy_(fused_expert_output)
+class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
+    def __init__(
+        self,
+        use_dp: bool,
+        num_dispatchers: int = 1,
+        use_deepseek_fp8_block_scale: bool = False,
+    ):
+        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: torch.Tensor | None,
+        apply_router_weight_on_input: bool,
+        quant_config: FusedMoEQuantConfig,
+    ) -> mk.PrepareResultType:
+        self._apply_router_weight_on_input(
+            a1, topk_weights, topk_ids, apply_router_weight_on_input
+        )
+        if not self.use_dp and quant_config.quant_dtype == "nvfp4":
+            return a1, None, None, topk_ids, topk_weights
+        if not self.use_deepseek_fp8_block_scale:
+            a1q, a1q_scale = moe_kernel_quantize_input(
+                a1,
+                quant_config.a1_gscale,
+                quant_config.quant_dtype,
+                quant_config.per_act_token_quant,
+                quant_config.block_shape,
+                is_fp4_scale_swizzled=not self.use_dp,
+            )
+        else:
+            # Block-scale path: pass activations through, omit per-token scales
+            a1q = a1
+            a1q_scale = None
+        if self.use_dp:
+            # Build gather list conditionally - omit a1q_scale if None
+            # (block-scale path)
+            gather_list = [topk_weights, topk_ids, a1q]
+            if a1q_scale is not None:
+                gather_list.append(a1q_scale)
+                gathered = get_dp_group().all_gatherv(
+                    gather_list,
+                    dim=0,
+                    sizes=get_local_sizes(),
+                )
+                topk_weights, topk_ids, a1q, a1q_scale = gathered
+            else:
+                gathered = get_dp_group().all_gatherv(
+                    gather_list,
+                    dim=0,
+                    sizes=get_local_sizes(),
+                )
+                topk_weights, topk_ids, a1q = gathered
+                a1q_scale = None
+        if quant_config.quant_dtype == "nvfp4" and a1q_scale is not None:
+            a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
+        return a1q, a1q_scale, None, topk_ids, topk_weights
+    def finalize(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> None:
+        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP)
+        if self.use_dp:
+            fused_expert_output = get_dp_group().reduce_scatterv(
+                fused_expert_output, dim=0, sizes=get_local_sizes()
+            )
+        output.copy_(fused_expert_output)
+def flashinfer_alltoall_dispatch(
+    all2all_manager: All2AllManagerBase,
+    global_num_tokens_cpu: list[int],
+    x: torch.Tensor,
+    gs: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    top_k: int,
+    num_experts: int,
+    quant_config: FusedMoEQuantConfig,
+    use_deepseek_fp8_block_scale: bool = False,
+):
+    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
+    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
+        "FlashInfer AllToAll workspace not available"
+    )
+    ep_rank = all2all_manager.rank
+    ep_size = all2all_manager.world_size
+    max_num_token = (
+        max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else x.shape[0]
+    )
+    orig_topk_weights_dtype = topk_weights.dtype
+    alltoall_info, topk_ids, topk_weights, _ = (
+        MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
+            topk_ids,
+            topk_weights,
+            None,
+            all2all_manager.prepare_workspace_tensor,
+            max_num_token,
+            ep_rank,
+            ep_size,
+            num_experts,
+            num_experts,
+            top_k,
+        )
+    )
+    topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)
+    if not use_deepseek_fp8_block_scale:
+        x, x_sf = moe_kernel_quantize_input(
+            x,
+            gs,
+            quant_config.quant_dtype,
+            quant_config.per_act_token_quant,
+            quant_config.block_shape,
+            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
+        )
+        x = MnnvlMoe.mnnvl_moe_alltoallv(
+            x,
+            alltoall_info,
+            all2all_manager.workspace_tensor,
+            ep_rank,
+            ep_size,
+        )
+        x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
+            x_sf,
+            alltoall_info,
+            all2all_manager.workspace_tensor,
+            ep_rank,
+            ep_size,
+        )
+        if quant_config.quant_dtype == "nvfp4":
+            x_sf = nvfp4_block_scale_interleave(x_sf)
+    else:
+        # Block-scale path: pass activations through without quantization
+        x_sf = None
+        x = MnnvlMoe.mnnvl_moe_alltoallv(
+            x,
+            alltoall_info,
+            all2all_manager.workspace_tensor,
+            ep_rank,
+            ep_size,
+        )
+    return alltoall_info, topk_ids, topk_weights, x, x_sf
+def flashinfer_alltoall_combine(
+    all2all_manager: All2AllManagerBase,
+    output: torch.Tensor,
+    top_k: int,
+    token_count: int,
+    alltoall_info,
+):
+    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
+    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
+        "FlashInfer AllToAll workspace not available"
+    )
+    return MnnvlMoe.mnnvl_moe_alltoallv_combine(
+        output,
+        alltoall_info,
+        all2all_manager.workspace_tensor,
+        ep_rank=all2all_manager.rank,
+        ep_size=all2all_manager.world_size,
+        top_k=top_k,
+        token_count=token_count,
+    )
+def create_flashinfer_prepare_finalize(
+    use_dp: bool,
+    use_nvfp4: bool = False,
+    enable_alltoallv: bool = False,
+    use_deepseek_fp8_block_scale: bool = False,
+) -> FlashInferCutlassMoEPrepareAndFinalize:
+    """Factory function to create the appropriate FlashInfer implementation."""
+    if use_nvfp4:
+        if enable_alltoallv:
+            return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)
+        else:
+            return FlashInferAllGatherMoEPrepareAndFinalize(use_dp)
+    # FP8 path currently supported via AllGather; optionally enable block-scale
+    return FlashInferAllGatherMoEPrepareAndFinalize(
+        use_dp=use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
+    )

vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py ADDED Viewed

@@ -0,0 +1,192 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    calculate_tile_tokens_dim,
+)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+)
+from vllm.utils.torch_utils import direct_register_custom_op
+def flashinfer_fused_moe_blockscale_fp8(
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor,
+    x: torch.Tensor,
+    w13_weight: torch.Tensor,
+    w13_weight_scale_inv: torch.Tensor,
+    w2_weight: torch.Tensor,
+    w2_weight_scale_inv: torch.Tensor,
+    global_num_experts: int,
+    top_k: int,
+    num_expert_group: int | None,
+    topk_group: int | None,
+    intermediate_size: int,
+    expert_offset: int,
+    local_num_experts: int,
+    block_shape: list[int],
+    routing_method_type: int = RoutingMethodType.DeepSeekV3,
+    routed_scaling: float | None = 1.0,
+) -> torch.Tensor:
+    from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe
+    topk_group = topk_group if topk_group is not None else 0
+    assert top_k <= global_num_experts
+    assert top_k <= 10
+    assert global_num_experts % 4 == 0
+    assert block_shape == [128, 128]
+    # Routing kernel expects #experts <= #threads 512
+    assert global_num_experts <= 512
+    a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1])
+    # NOTE: scales of hidden states have to be transposed!
+    a_sf_t = a_sf.t().contiguous()
+    return flashinfer_trtllm_fp8_block_scale_moe(
+        routing_logits=routing_logits,
+        routing_bias=routing_bias,
+        hidden_states=a_q,
+        hidden_states_scale=a_sf_t,
+        gemm1_weights=w13_weight,
+        gemm1_weights_scale=w13_weight_scale_inv,
+        gemm2_weights=w2_weight,
+        gemm2_weights_scale=w2_weight_scale_inv,
+        num_experts=global_num_experts,
+        top_k=top_k,
+        n_group=num_expert_group,
+        topk_group=topk_group,
+        intermediate_size=intermediate_size,
+        local_expert_offset=expert_offset,
+        local_num_experts=local_num_experts,
+        routed_scaling_factor=routed_scaling,
+        tile_tokens_dim=None,
+        routing_method_type=routing_method_type,
+        use_shuffled_weight=False,
+    )
+def flashinfer_fused_moe_blockscale_fp8_fake(
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor,
+    x: torch.Tensor,
+    w13_weight: torch.Tensor,
+    w13_weight_scale_inv: torch.Tensor,
+    w2_weight: torch.Tensor,
+    w2_weight_scale_inv: torch.Tensor,
+    global_num_experts: int,
+    top_k: int,
+    num_expert_group: int,
+    topk_group: int,
+    intermediate_size: int,
+    expert_offset: int,
+    local_num_experts: int,
+    block_shape: list[int],
+    routing_method_type: int,
+    routed_scaling: float = 1.0,
+) -> torch.Tensor:
+    return torch.empty_like(x)
+# TODO(bnell): Does this really need to be a torch.op?
+# Register the block-scale FP8 MoE entry point as a custom op, pairing the real
+# implementation with its fake counterpart (which only allocates an output
+# shaped like ``x``). The op is tagged with ``needs_fixed_stride_order``.
+direct_register_custom_op(
+    op_name="flashinfer_fused_moe_blockscale_fp8",
+    op_func=flashinfer_fused_moe_blockscale_fp8,
+    fake_impl=flashinfer_fused_moe_blockscale_fp8_fake,
+    tags=(torch.Tag.needs_fixed_stride_order,),
+)
+def flashinfer_fused_moe_per_tensor_scale_fp8(
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor | None,
+    hidden_states: torch.Tensor,
+    input_scale: torch.Tensor,
+    gemm1_weights: torch.Tensor,
+    gemm2_weights: torch.Tensor,
+    output1_scales_scalar: torch.Tensor,
+    output1_scales_gate_scalar: torch.Tensor,
+    output2_scales_scalar: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    num_expert_group: int | None,
+    topk_group: int | None,
+    intermediate_size: int,
+    local_expert_offset: int,
+    local_num_experts: int,
+    use_routing_scales_on_input: bool,
+    routing_method_type: int,
+    routed_scaling_factor: float = 1.0,
+) -> torch.Tensor:
+    num_expert_group = num_expert_group if num_expert_group is not None else 0
+    topk_group = topk_group if topk_group is not None else 0
+    quant_hidden_states, _ = moe_kernel_quantize_input(
+        hidden_states,
+        input_scale,
+        quant_dtype=torch.float8_e4m3fn,
+        per_act_token_quant=False,
+    )
+    from vllm.utils.flashinfer import flashinfer_trtllm_fp8_per_tensor_scale_moe
+    return flashinfer_trtllm_fp8_per_tensor_scale_moe(
+        routing_logits=routing_logits,
+        routing_bias=routing_bias,
+        hidden_states=quant_hidden_states,
+        gemm1_weights=gemm1_weights,
+        output1_scales_scalar=output1_scales_scalar,
+        output1_scales_gate_scalar=output1_scales_gate_scalar,
+        gemm2_weights=gemm2_weights,
+        output2_scales_scalar=output2_scales_scalar,
+        num_experts=num_experts,
+        top_k=top_k,
+        n_group=num_expert_group,
+        topk_group=topk_group,
+        intermediate_size=intermediate_size,
+        local_expert_offset=local_expert_offset,
+        local_num_experts=local_num_experts,
+        routed_scaling_factor=routed_scaling_factor,
+        use_routing_scales_on_input=use_routing_scales_on_input,
+        tile_tokens_dim=calculate_tile_tokens_dim(
+            hidden_states.shape[0], top_k, num_experts
+        ),
+        routing_method_type=routing_method_type,
+    )
+def flashinfer_fused_moe_per_tensor_scale_fp8_fake(
+    routing_logits: torch.Tensor,
+    routing_bias: torch.Tensor | None,
+    hidden_states: torch.Tensor,
+    input_scale: torch.Tensor,
+    gemm1_weights: torch.Tensor,
+    gemm2_weights: torch.Tensor,
+    output1_scales_scalar: torch.Tensor,
+    output1_scales_gate_scalar: torch.Tensor,
+    output2_scales_scalar: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    num_expert_group: int | None,
+    topk_group: int | None,
+    intermediate_size: int,
+    local_expert_offset: int,
+    local_num_experts: int,
+    use_routing_scales_on_input: bool,
+    routing_method_type: int,
+    routed_scaling_factor: float = 1.0,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+# TODO(bnell): Does this really need to be a torch.op?
+# Register the per-tensor-scale FP8 MoE entry point as a custom op, pairing the
+# real implementation with its fake counterpart (which only allocates an
+# output shaped like ``hidden_states``).
+# NOTE(review): ``hidden_states`` is declared as mutated, yet the op body only
+# passes it to ``moe_kernel_quantize_input`` and uses the returned tensor;
+# confirm whether that helper writes in place or whether the declaration is
+# merely conservative.
+direct_register_custom_op(
+    op_name="flashinfer_fused_moe_per_tensor_scale_fp8",
+    op_func=flashinfer_fused_moe_per_tensor_scale_fp8,
+    mutates_args=["hidden_states"],
+    fake_impl=flashinfer_fused_moe_per_tensor_scale_fp8_fake,
+    tags=(torch.Tag.needs_fixed_stride_order,),
+)