PyPI - vllm-cpu-avx512vnni - Versions diffs - 0.10.2.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-avx512vnni 0.10.2.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu-avx512vnni might be problematic. Click here for more details.

Files changed (1395) hide show

vllm/model_executor/layers/fused_moe/routing_simulator.py ADDED Viewed

@@ -0,0 +1,291 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Token-to-Expert Routing Simulator
+This module provides a framework for simulating and testing different
+token-to-expert routing strategies for Mixture of Experts (MoE) models.
+It supports routing logic customization and includes example implementations
+like uniform random routing.
+"""
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+import torch
+class RoutingStrategy(ABC):
+    """Abstract interface implemented by every token-to-expert router."""
+    @abstractmethod
+    def route_tokens(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        indices_type: Optional[torch.dtype] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Choose experts for every token.
+        Args:
+            hidden_states: Token activations [num_tokens, hidden_size]
+            router_logits: Per-expert router scores [num_tokens, num_experts]
+            top_k: How many experts each token is assigned
+            indices_type: Requested dtype of the returned expert indices
+        Returns:
+            A (topk_weights, topk_ids) pair
+        """
+        ...
+class DistributionBasedRouting(RoutingStrategy):
+    """
+    Random routing strategy driven by a configurable distribution.
+    Expert IDs and routing weights are sampled rather than derived from the
+    router logits; the logits are only consulted for the number of experts.
+    Supported distributions are "uniform" and "normal".
+    Expert IDs are drawn independently for each of the top_k slots, so a
+    token may be assigned the same expert more than once.
+    """
+    def __init__(self,
+                 distribution: str = "uniform",
+                 **distribution_params: Any):
+        """
+        Initialize distribution-based routing.
+        Args:
+            distribution: Case-insensitive name of the sampling distribution
+                - "uniform": Uniform distribution (default)
+                - "normal": Normal/Gaussian distribution
+            **distribution_params: Parameters specific to the
+                chosen distribution
+                For "uniform": No additional parameters needed
+                For "normal": mean (default: 0.0), std (default: 1.0)
+        Raises:
+            ValueError: If the distribution name is not supported
+        """
+        self.distribution = distribution.lower()
+        self.distribution_params = distribution_params
+        # Validate distribution and parameters
+        self._validate_distribution_params()
+    def _validate_distribution_params(self):
+        """
+        Check the distribution name and fill in default parameters.
+        Raises:
+            ValueError: If the distribution name is not supported
+        """
+        valid_distributions = ["uniform", "normal"]
+        if self.distribution not in valid_distributions:
+            raise ValueError(f"Unsupported distribution: {self.distribution}. "
+                             f"Supported distributions: {valid_distributions}")
+        # Set default parameters if not provided
+        if self.distribution == "normal":
+            self.distribution_params.setdefault("mean", 0.0)
+            self.distribution_params.setdefault("std", 1.0)
+    def route_tokens(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        indices_type: Optional[torch.dtype] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Randomly select experts for each token using the specified distribution.
+        Only the shapes and device of the inputs are used: the token count
+        and device come from hidden_states, the expert count from the last
+        dimension of router_logits.
+        Args:
+            hidden_states: Input hidden states [num_tokens, hidden_size]
+            router_logits: Router logits [num_tokens, num_experts]
+            top_k: Number of experts to select per token
+            indices_type: Data type for expert indices (torch.long if None)
+        Returns:
+            tuple of (topk_weights, topk_ids) where:
+            - topk_weights: Weights based on distribution sampling
+            - topk_ids: Expert indices sampled from the distribution
+        """
+        num_tokens = hidden_states.shape[0]
+        num_experts = router_logits.shape[-1]
+        if indices_type is None:
+            indices_type = torch.long
+        # Generate expert IDs based on the specified distribution
+        topk_ids = self._sample_expert_ids(num_tokens, num_experts, top_k,
+                                           hidden_states.device, indices_type)
+        # Generate weights based on the distribution
+        topk_weights = self._generate_weights(num_tokens, top_k,
+                                              hidden_states.device)
+        return topk_weights, topk_ids
+    def _sample_expert_ids(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        top_k: int,
+        device: torch.device,
+        indices_type: torch.dtype,
+    ) -> torch.Tensor:
+        """
+        Sample a [num_tokens, top_k] tensor of expert IDs in [0, num_experts).
+        Raises:
+            ValueError: If the configured distribution is not supported
+        """
+        if self.distribution == "uniform":
+            # Uniform random sampling
+            return torch.randint(
+                low=0,
+                high=num_experts,
+                size=(num_tokens, top_k),
+                dtype=indices_type,
+                device=device,
+            )
+        elif self.distribution == "normal":
+            # For normal distribution, sample continuous values and map to
+            # expert IDs
+            continuous_samples = self._sample_continuous_distribution(
+                num_tokens, top_k, device)
+            # Map continuous samples to expert indices
+            # Normalize to [0, 1] range and scale to [0, num_experts)
+            normalized_samples = self._normalize_samples(continuous_samples)
+            expert_ids = (normalized_samples * num_experts).long()
+            # Clamp so a normalized sample of exactly 1.0 stays in range
+            expert_ids = torch.clamp(expert_ids, 0, num_experts - 1)
+            return expert_ids.to(dtype=indices_type)
+        else:
+            raise ValueError(f"Unsupported distribution: {self.distribution}")
+    def _sample_continuous_distribution(self, num_tokens: int, top_k: int,
+                                        device: torch.device) -> torch.Tensor:
+        """
+        Draw a [num_tokens, top_k] tensor from the continuous distribution.
+        Raises:
+            ValueError: If the configured distribution is not continuous
+        """
+        shape = (num_tokens, top_k)
+        if self.distribution == "normal":
+            mean = self.distribution_params["mean"]
+            std = self.distribution_params["std"]
+            return torch.normal(mean, std, size=shape, device=device)
+        else:
+            raise ValueError(
+                f"Unsupported continuous distribution: {self.distribution}")
+    def _normalize_samples(self, samples: torch.Tensor) -> torch.Tensor:
+        """
+        Normalize samples to [0, 1] range.
+        Raises:
+            ValueError: If the configured distribution has no normalization
+        """
+        if self.distribution == "normal":
+            # Use sigmoid to map normal distribution to [0, 1]
+            return torch.sigmoid(samples)
+        else:
+            raise ValueError(f"Unsupported distribution for normalization: "
+                             f"{self.distribution}")
+    def _generate_weights(self, num_tokens: int, top_k: int,
+                          device: torch.device) -> torch.Tensor:
+        """
+        Generate a [num_tokens, top_k] tensor of routing weights.
+        "uniform" yields all-ones float32 weights. "normal" yields the
+        absolute values of fresh samples, normalized to sum to 1 per token.
+        Raises:
+            ValueError: If the configured distribution is not supported
+        """
+        if self.distribution == "uniform":
+            # All-ones weights for uniform distribution
+            return torch.ones(
+                (num_tokens, top_k),
+                dtype=torch.float32,
+                device=device,
+            )
+        elif self.distribution == "normal":
+            # Draw fresh samples (independent of the ones used for the
+            # expert IDs) from the same distribution
+            continuous_weights = self._sample_continuous_distribution(
+                num_tokens, top_k, device)
+            # Normalize to non-negative values that sum to 1 per token
+            weights = torch.abs(continuous_weights)
+            totals = weights.sum(dim=-1, keepdim=True)
+            has_mass = totals > 0
+            # A row whose samples are all exactly zero (e.g. mean=0, std=0)
+            # would otherwise be 0 / 0 = NaN; such rows get equal weights.
+            safe_totals = torch.where(has_mass, totals,
+                                      torch.ones_like(totals))
+            equal_share = torch.full_like(weights, 1.0 / max(top_k, 1))
+            return torch.where(has_mass, weights / safe_totals, equal_share)
+        else:
+            raise ValueError(
+                f"Unsupported distribution for weight generation: "
+                f"{self.distribution}")
+    def get_distribution_info(self) -> dict:
+        """
+        Describe the current configuration.
+        Returns:
+            dict with the distribution name and a shallow copy of its
+            parameters
+        """
+        return {
+            "distribution": self.distribution,
+            "parameters": self.distribution_params.copy()
+        }
+class RoutingSimulator:
+    """
+    Registry and dispatcher for token-to-expert routing strategies.
+    Strategies are kept by name in a class-level mapping shared by every
+    user of the class; `simulate_routing` looks one up and delegates the
+    routing call to it.
+    """
+    # Built-in strategies; more can be added through register_strategy()
+    _routing_strategies: dict[str, RoutingStrategy] = {
+        "uniform_random":
+        DistributionBasedRouting(distribution="uniform", mean=0.0, std=1.0),
+        "normal_routing":
+        DistributionBasedRouting(distribution="normal", mean=0.0, std=1.0),
+    }
+    @classmethod
+    def register_strategy(cls, name: str, strategy: RoutingStrategy):
+        """
+        Add a strategy to the shared registry, replacing any entry that
+        already uses the same name.
+        Args:
+            name: Key under which the strategy is stored
+            strategy: RoutingStrategy instance to store
+        """
+        registry = cls._routing_strategies
+        registry[name] = strategy
+    @classmethod
+    def get_available_strategies(cls) -> list[str]:
+        """
+        List the names currently present in the registry.
+        Returns:
+            Registered strategy names, in registration order
+        """
+        return list(cls._routing_strategies)
+    @staticmethod
+    def simulate_routing(
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        strategy_name: str,
+        top_k: int,
+        indices_type: Optional[torch.dtype] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Route tokens with the registered strategy called `strategy_name`.
+        Args:
+            hidden_states: Input hidden states [num_tokens, hidden_size]
+            router_logits: Router logits [num_tokens, num_experts]
+            strategy_name: Registry key of the strategy to run
+            top_k: Number of experts to select per token
+            indices_type: Data type for expert indices
+        Returns:
+            tuple of (topk_weights, topk_ids) produced by the strategy
+        Raises:
+            ValueError: If no strategy is registered under strategy_name
+        """
+        registry = RoutingSimulator._routing_strategies
+        if strategy_name in registry:
+            chosen = registry[strategy_name]
+            return chosen.route_tokens(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                top_k=top_k,
+                indices_type=indices_type,
+            )
+        known_names = list(registry)
+        raise ValueError(f"Unknown routing strategy: {strategy_name}. "
+                         f"Available strategies: "
+                         f"{known_names}")

vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py ADDED Viewed

@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+import vllm._custom_ops as ops
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+class TopKWeightAndReduceDelegate(mk.TopKWeightAndReduce):
+    """
+    Placeholder that defers the choice of weight-and-reduce implementation.
+    It is meant for a FusedMoEPermuteExpertsUnpermute implementation that
+    does not apply the top-k weights or reduce by itself, yet cannot pick
+    one approach that suits every compatible PrepareAndFinalize
+    implementation.
+    Example: BatchedTritonExperts works with both PplxPrepareAndFinalize
+    and BatchedPrepareAndFinalize. PplxPrepareAndFinalize performs the
+    weight application + reduction inside the pplx combine kernel, while
+    BatchedPrepareAndFinalize needs an explicit implementation. By returning
+    TopKWeightAndReduceDelegate, BatchedTritonExperts lets each
+    PrepareAndFinalize implementation decide how to weight + reduce.
+    """
+    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
+    def __eq__(self, other):
+        # Equal to every TopKWeightAndReduceDelegate (subclasses included)
+        return isinstance(other, TopKWeightAndReduceDelegate)
+    def apply(self, output: Optional[torch.Tensor],
+              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
+              topk_ids: torch.Tensor,
+              apply_router_weight_on_input: bool) -> torch.Tensor:
+        """
+        Always fail: the delegate must be replaced before it is applied.
+        Raises:
+            RuntimeError: Unconditionally
+        """
+        message = ("The caller is expected to choose an appropriate "
+                   "TopKWeightAndReduce implementation.")
+        raise RuntimeError(message)
+class TopKWeightAndReduceNoOP(mk.TopKWeightAndReduce):
+    """
+    The fused_experts outputs have already been weight applied and reduced.
+    This implementation only hands the result back, copying it into the
+    caller-provided `output` tensor when one is given.
+    """
+    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
+    def __eq__(self, other):
+        # Equal to every TopKWeightAndReduceNoOP (subclasses included)
+        return isinstance(other, TopKWeightAndReduceNoOP)
+    def apply(self, output: Optional[torch.Tensor],
+              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
+              topk_ids: torch.Tensor,
+              apply_router_weight_on_input: bool) -> torch.Tensor:
+        """
+        Return the already weighted-and-reduced expert output.
+        Args:
+            output: Optional destination tensor; must have the same shape as
+                fused_expert_output when given
+            fused_expert_output: Result produced by the experts
+            topk_weights: Unused
+            topk_ids: Unused
+            apply_router_weight_on_input: Unused
+        Returns:
+            fused_expert_output itself when output is None, otherwise output
+            after the result has been copied into it
+        """
+        # Weight application and reduction operations are already done.
+        if output is None:
+            return fused_expert_output
+        # MoEPrepareAndFinalizeNoEP needs the output to be in the `output`
+        # tensor.
+        assert output.size() == fused_expert_output.size(), (
+            "output shape is expected to match the fused_expert_output shape. "
+            f"But got output={output.size()}, "
+            f"fused_expert_output={fused_expert_output.size()}")
+        output.copy_(fused_expert_output, non_blocking=True)
+        return output
+class TopKWeightAndReduceContiguous(mk.TopKWeightAndReduce):
+    """
+    Weight-and-reduce for a fused_experts output laid out as (m, topk, K).
+    A 2-D input is first viewed as (m, topk, K).
+    """
+    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
+    def __eq__(self, other):
+        # Equal to every TopKWeightAndReduceContiguous (subclasses included)
+        return isinstance(other, TopKWeightAndReduceContiguous)
+    def apply(self, output: Optional[torch.Tensor],
+              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
+              topk_ids: torch.Tensor,
+              apply_router_weight_on_input: bool) -> torch.Tensor:
+        """
+        Scale each expert output by its top-k weight and sum over top-k.
+        The scaling is done in place on fused_expert_output and is skipped
+        when apply_router_weight_on_input is True.
+        Args:
+            output: Optional (m, K) destination; allocated when None
+            fused_expert_output: Expert results, (m, topk, K) or viewable
+                as such
+            topk_weights: Routing weights, viewable as (m, topk, 1)
+            topk_ids: Expert ids; only its (m, topk) shape is used
+            apply_router_weight_on_input: True if the weights were already
+                applied upstream
+        Returns:
+            The (m, K) tensor filled by ops.moe_sum
+        """
+        num_tokens, experts_per_token = topk_ids.size()
+        hidden_dim = fused_expert_output.size(-1)
+        expected_shape = (num_tokens, experts_per_token, hidden_dim)
+        reduced_shape = (num_tokens, hidden_dim)
+        if fused_expert_output.ndim == 2:
+            fused_expert_output = fused_expert_output.view(*expected_shape)
+        assert fused_expert_output.size() == expected_shape, (
+            f"Expected fused_expert_output size {expected_shape}. But got "
+            f"{fused_expert_output.size()}")
+        if not apply_router_weight_on_input:
+            per_slot_weights = topk_weights.view(num_tokens, -1, 1)
+            fused_expert_output.mul_(per_slot_weights)
+        if output is None:
+            output = torch.empty(reduced_shape,
+                                 device=fused_expert_output.device,
+                                 dtype=fused_expert_output.dtype)
+        assert output.size() == reduced_shape, (
+            f"Expected output size {reduced_shape}. But got {output.size()}")
+        ops.moe_sum(fused_expert_output, output)
+        return output
+class TopKWeightAndReduceNaiveBatched(mk.TopKWeightAndReduce):
+    """
+    TopKWeightAndReduce implementation for a fused_experts output
+    of shape (num_experts, batch_size, K)
+    The reduction is a Python loop over the experts held by this rank.
+    """
+    def __init__(self, rank: int):
+        # Rank index used to derive the range of global expert ids handled
+        # by this instance (see `first_expert` in apply()).
+        self.rank = rank
+    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
+    def __eq__(self, other):
+        # Equal only to instances of this class that carry the same rank.
+        return (isinstance(other, TopKWeightAndReduceNaiveBatched)
+                and (other.rank == self.rank))
+    def apply(self, output: Optional[torch.Tensor],
+              fused_expert_output: torch.Tensor, topk_weights: torch.Tensor,
+              topk_ids: torch.Tensor,
+              apply_router_weight_on_input: bool) -> torch.Tensor:
+        """
+        Accumulate the per-expert batched outputs into one row per token.
+        Args:
+            output: Optional (num_tokens, K) destination. It is zero-filled
+                before accumulation; allocated when None.
+            fused_expert_output: 3-D tensor indexed as
+                [local_expert, row, K]. It is scaled in place unless
+                apply_router_weight_on_input is True.
+            topk_weights: Routing weights, indexed with the same mask as
+                topk_ids
+            topk_ids: Expert ids per token, 2-D (tokens x top-k slots)
+            apply_router_weight_on_input: True if the weights were already
+                applied upstream
+        Returns:
+            The (num_tokens, K) accumulated output
+        """
+        assert fused_expert_output.ndim == 3
+        num_tokens = topk_ids.size(0)
+        num_local_experts = fused_expert_output.size(0)
+        K = fused_expert_output.size(-1)
+        if output is None:
+            output = torch.zeros((num_tokens, K),
+                                 device=fused_expert_output.device,
+                                 dtype=fused_expert_output.dtype)
+        else:
+            # Start from zero because results are accumulated below.
+            output.fill_(0)
+        assert output.size() == (num_tokens, K), (
+            f"Expected output size {(num_tokens, K)}, but got {output.size()}")
+        # Global ids of the experts covered by this instance.
+        # NOTE(review): assumes every rank holds the same number of experts
+        # in contiguous, rank-ordered id ranges - confirm against the caller.
+        first_expert = num_local_experts * self.rank
+        last_expert = first_expert + num_local_experts
+        for expert_id in range(first_expert, last_expert):
+            # Boolean mask over (token, slot): slots routed to this expert.
+            matching_tokens = topk_ids == expert_id
+            # Per-token mask: token uses this expert in at least one slot.
+            topks = torch.any(matching_tokens, dim=1).flatten()
+            # Number of tokens routed to this expert (a 0-dim tensor that is
+            # used as a slice bound below).
+            rows = torch.count_nonzero(topks)
+            # First `rows` rows of this expert's batch.
+            # NOTE(review): presumably the rows are stored in token order so
+            # they line up with `topks` - confirm against the producer.
+            rhs = fused_expert_output[expert_id - first_expert, :rows, :]
+            if not apply_router_weight_on_input:
+                # In-place scale of the slice (mutates fused_expert_output).
+                # The view requires exactly one matching slot per selected
+                # token, i.e. a token may not list this expert twice.
+                rhs.mul_(topk_weights[matching_tokens].view(rhs.size(0), 1))
+            output[topks] = output[topks] + rhs
+        return output

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py ADDED Viewed

@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+    DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape,
+    deep_gemm_block_shape)
+from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
+    """
+    Experts implementation that dispatches between TritonExperts and
+    DeepGemmExperts.
+    A TritonExperts instance is always created. A DeepGemmExperts instance
+    is created only when deep gemm is allowed (see __init__); each call then
+    picks one of the two.
+    """
+    def __init__(
+        self,
+        use_fp8_w8a8: bool = False,
+        use_int8_w8a8: bool = False,
+        use_int8_w8a16: bool = False,
+        use_int4_w4a16: bool = False,
+        use_mxfp4_w4a4: bool = False,
+        per_act_token_quant: bool = False,
+        block_shape: Optional[list[int]] = None,
+        allow_deep_gemm: bool = False,
+    ):
+        """
+        Build the quantization config and the wrapped expert implementations.
+        The quantization flags, per_act_token_quant and block_shape are
+        forwarded unchanged to FusedMoEQuantConfig.make and TritonExperts.
+        Args:
+            allow_deep_gemm: Request the DeepGemm path. It is honoured only
+                together with use_fp8_w8a8 and a block shape equal to
+                deep_gemm_block_shape().
+        """
+        super().__init__(
+            FusedMoEQuantConfig.make(
+                use_fp8_w8a8=use_fp8_w8a8,
+                use_int8_w8a8=use_int8_w8a8,
+                use_int8_w8a16=use_int8_w8a16,
+                use_int4_w4a16=use_int4_w4a16,
+                use_mxfp4_w4a4=use_mxfp4_w4a4,
+                per_act_token_quant=per_act_token_quant,
+                block_shape=block_shape,
+            ))
+        self.triton_expert = TritonExperts(
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int4_w4a16=use_int4_w4a16,
+            use_int8_w8a16=use_int8_w8a16,
+            use_mxfp4_w4a4=use_mxfp4_w4a4,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape,
+        )
+        # NOTE(review): self.block_shape is not set in this class; presumably
+        # the base class exposes it from the quant config - confirm.
+        self.allow_deep_gemm = (allow_deep_gemm and use_fp8_w8a8 and
+                                self.block_shape == deep_gemm_block_shape())
+        # None whenever the DeepGemm path is disabled.
+        self.deep_gemm_expert = DeepGemmExperts(
+        ) if self.allow_deep_gemm else None
+    @property
+    def activation_formats(
+        self
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        """
+        Activation formats of the Triton experts.
+        Asserts that the DeepGemm experts, when present, report the same
+        formats.
+        """
+        assert (self.deep_gemm_expert is None
+                or self.triton_expert.activation_formats
+                == self.deep_gemm_expert.activation_formats)
+        return self.triton_expert.activation_formats
+    def supports_chunking(self) -> bool:
+        """True only if every wrapped implementation supports chunking."""
+        dge = self.deep_gemm_expert
+        te = self.triton_expert
+        # __init__ always sets triton_expert, so the `te is None` case does
+        # not arise for instances built by this class's constructor.
+        return ((dge is None or dge.supports_chunking())
+                and (te is None or te.supports_chunking()))
+    def supports_expert_map(self) -> bool:
+        """True only if every wrapped implementation supports expert maps."""
+        dge = self.deep_gemm_expert
+        te = self.triton_expert
+        return ((dge is None or dge.supports_expert_map())
+                and (te is None or te.supports_expert_map()))
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        """
+        Return the weight-and-reduce implementation of the wrapped experts.
+        When both implementations provide one, they are asserted to compare
+        equal; the DeepGemm one is returned if available, otherwise the
+        Triton one.
+        """
+        dge = self.deep_gemm_expert
+        te = self.triton_expert
+        dge_war = dge.finalize_weight_and_reduce_impl() if dge else None
+        te_war = te.finalize_weight_and_reduce_impl() if te else None
+        is_dge_war = dge_war is not None
+        is_te_war = te_war is not None
+        if is_dge_war and is_te_war:
+            assert dge_war == te_war, (
+                "Both implementations should agree on WeightAndReduce impls. "
+                f"Got dge_war: {dge_war}, and te_war: {te_war}")
+        if dge_war is not None:
+            return dge_war
+        assert te_war is not None
+        return te_war
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        aq: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        """
+        Return the workspace shapes and dtype of the selected implementation.
+        All arguments are forwarded unchanged. DeepGemm is selected when it
+        is allowed and either is_deep_gemm_e8m0_used() or
+        _valid_deep_gemm_shape(M, N, K) holds; otherwise Triton is used.
+        """
+        # Note: the deep gemm workspaces are strictly larger than the triton
+        # workspaces so we can be pessimistic here and allocate for DeepGemm
+        # even if we fall back to triton later, e.g. if expert maps are set.
+        if self.allow_deep_gemm and (is_deep_gemm_e8m0_used()
+                                     or _valid_deep_gemm_shape(M, N, K)):
+            assert self.deep_gemm_expert is not None
+            return self.deep_gemm_expert.workspace_shapes(
+                a, aq, M, N, K, topk, global_num_experts, local_num_experts,
+                expert_tokens_meta)
+        else:
+            return self.triton_expert.workspace_shapes(a, aq, M, N, K, topk,
+                                                       global_num_experts,
+                                                       local_num_experts,
+                                                       expert_tokens_meta)
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        apply_router_weight_on_input: bool,
+    ):
+        """
+        Run the selected experts implementation with the given arguments.
+        DeepGemm is selected when it is allowed and either
+        _valid_deep_gemm(hidden_states, w1, w2) or is_deep_gemm_e8m0_used()
+        holds; otherwise Triton is used. Every argument is forwarded
+        positionally and unchanged. Nothing is returned; presumably the
+        result is written into `output` by the callee - TODO confirm.
+        """
+        # This check differs from the one in workspace_shapes(), which only
+        # looks at (M, N, K); see the note there about over-allocating.
+        use_deep_gemm = (self.allow_deep_gemm
+                         and (_valid_deep_gemm(hidden_states, w1, w2)
+                              or is_deep_gemm_e8m0_used()))
+        experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert
+        assert experts is not None
+        experts.apply(
+            output,
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            activation,
+            global_num_experts,
+            expert_map,
+            w1_scale,
+            w2_scale,
+            w1_zp,
+            w2_zp,
+            a1q_scale,
+            a2_scale,
+            workspace13,
+            workspace2,
+            expert_tokens_meta,
+            apply_router_weight_on_input,
+        )