PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/model_executor/layers/fused_moe/moe_pallas.py ADDED Viewed

@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn.functional as F
+import torch_xla.experimental.custom_kernel  # noqa: F401
+def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
+    """
+  Compute the histogram of a int32 tensor. The bin edges are defined by the
+  min and max values, with step = 1.
+  """
+    assert input.dtype == torch.int32, "input must be of torch.int32 dtype."
+    assert min <= max, "min must be less than or equal to max."
+    def searchsorted(sorted_sequence: torch.Tensor,
+                     values_to_search: torch.Tensor) -> torch.Tensor:
+        return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1)
+    bin_edges = torch.linspace(min, max, max - min + 1,
+                               dtype=input.dtype).to(input.device)
+    return searchsorted(bin_edges, input).to(torch.int32)
def fused_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    global_num_experts: int,
    expert_map: torch.Tensor = None,
    renormalize: bool = False,
) -> torch.Tensor:
    """Mixture-of-experts forward pass built on the XLA ``gmm`` op.

    Each token is routed to its ``topk`` experts (softmax over
    ``gating_output``), the routed rows are sorted by expert id, pushed
    through two ``torch.ops.xla.gmm`` calls with a SiLU-gated activation in
    between, restored to token order and combined with the routing weights.

    Args:
        hidden_states: [*, hidden_size]
        w1: [num_experts, intermediate_size * 2, hidden_size]
        w2: [num_experts, hidden_size, intermediate_size]
        gating_output: [*, num_experts]
        topk: Number of experts selected per token.
        global_num_experts: Accepted for signature compatibility; not read
            by this implementation (the expert count comes from ``w1``).
        expert_map: Must be ``None``; expert mapping is not supported here.
        renormalize: If true, rescale each token's selected weights so they
            sum to one.

    Returns:
        Tensor with the same shape as the incoming ``hidden_states``.
    """
    assert expert_map is None, "expert_map is not supported for pallas MoE."

    input_shape = hidden_states.shape
    hidden_size = input_shape[-1]
    num_tokens = input_shape[:-1].numel()
    num_experts = w1.shape[0]
    intermediate_size = w2.shape[-1]
    out_dtype = hidden_states.dtype

    assert (num_tokens * topk) % 16 == 0, (
        "The Pallas GMM kernel requires num_tokens * topk to be a multiple of "
        f"16 but got {num_tokens * topk}")

    flat_hidden = hidden_states.view(num_tokens, hidden_size)
    router_logits = gating_output.view(num_tokens, num_experts)

    # Routing: softmax in float32, keep the top-k probabilities per token.
    router_probs = router_logits.softmax(dim=-1, dtype=torch.float)
    topk_weights, topk_indices = router_probs.topk(topk, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    topk_weights = topk_weights.to(out_dtype)

    # One entry per (token, selected expert) pair; sorting by expert id makes
    # the rows of each expert contiguous. The second argsort is the inverse
    # permutation used to restore the original order afterwards.
    flat_expert_ids = topk_indices.flatten()
    sort_order = flat_expert_ids.argsort()
    unsort_order = sort_order.argsort()

    slot_to_token = torch.arange(
        num_tokens, device=hidden_states.device).repeat_interleave(topk)
    sorted_token_rows = slot_to_token[sort_order]

    # Number of routed rows per expert, for expert ids 0 .. num_experts - 1.
    group_sizes = _histogram(flat_expert_ids.to(torch.int32), 0,
                             num_experts - 1)

    x = flat_hidden[sorted_token_rows]
    x = torch.ops.xla.gmm(x, w1, group_sizes, transpose_rhs=True)
    # The first half of the w1 output is gated through SiLU and multiplied
    # with the second half.
    gate = x[..., :intermediate_size]
    up = x[..., intermediate_size:]
    x = F.silu(gate) * up
    x = torch.ops.xla.gmm(x, w2, group_sizes, transpose_rhs=True)

    # Back to token-major order, then weight and sum the top-k contributions.
    x = x[unsort_order].reshape(-1, topk, hidden_size)
    x = (x * topk_weights.unsqueeze(dim=-1)).sum(dim=-2)
    return x.reshape(input_shape)

vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py ADDED Viewed

@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    moe_align_block_size)
+from vllm.model_executor.layers.fused_moe.utils import _fp8_perm
def _moe_permute(
    curr_hidden_states: torch.Tensor,
    a1q_scale: Optional[torch.Tensor],
    curr_topk_ids: torch.Tensor,
    global_num_experts: int,
    expert_map: Optional[torch.Tensor],
    block_m: int,
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
           torch.Tensor]:
    """
    Compute block-aligned token/expert orderings for the current chunk and
    gather the hidden states (and scales, if given) into that order.

    Returns a tuple of
    ``(permuted_hidden_states, permuted_a1q_scale, sorted_token_ids,
    expert_ids, inv_perm)`` where ``expert_ids`` has been repeated ``block_m``
    times per entry and ``inv_perm`` holds the first
    ``tokens_in_chunk * top_k`` entries of ``argsort(sorted_token_ids)``.
    """
    tokens_in_chunk = curr_hidden_states.size(0)
    top_k_num = curr_topk_ids.size(1)
    num_tokens = top_k_num * tokens_in_chunk

    sorted_token_ids, expert_ids, _ = moe_align_block_size(
        curr_topk_ids,
        block_m,
        global_num_experts,
        expert_map,
        pad_sorted_ids=True)

    expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)

    # The inverse permutation must be taken before the ids are clamped below.
    inv_perm = torch.argsort(sorted_token_ids)[:num_tokens]

    # Clamp so every id is a valid row index.
    # NOTE(review): presumably the out-of-range ids are padding entries
    # produced by moe_align_block_size -- confirm against that helper.
    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)

    # Each flattened (token, k) id maps back to its source token row.
    source_rows = sorted_token_ids // top_k_num
    curr_hidden_states = _fp8_perm(curr_hidden_states, source_rows)
    if a1q_scale is not None:
        a1q_scale = a1q_scale[source_rows]

    return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
            inv_perm)
def _moe_unpermute_and_reduce(
    out: torch.Tensor,
    curr_hidden: torch.Tensor,
    inv_perm: Optional[torch.Tensor],
    topk_weight: torch.Tensor,
    apply_router_weight_on_input: bool,
) -> None:
    """
    Undo the expert permutation (when ``inv_perm`` is given), scale by the
    top-k weights unless they were already applied to the input, and pass
    the per-token ``[topk, K]`` slices to ``ops.moe_sum`` together with
    ``out``.

    The weighting is done in place. When ``inv_perm`` is ``None`` the scaled
    tensor is a view of the caller's ``curr_hidden``, so that tensor is
    modified as well.
    """
    num_rows, topk = topk_weight.size()
    hidden_dim = curr_hidden.size(-1)

    if inv_perm is not None:
        curr_hidden = curr_hidden[inv_perm, ...]

    per_token = curr_hidden.view(-1, topk, hidden_dim)
    if not apply_router_weight_on_input:
        per_token.mul_(topk_weight.view(num_rows, -1, 1))

    ops.moe_sum(per_token, out)
def moe_permute(
    hidden_states: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    token_expert_indices: torch.Tensor,
    topk: int,
    n_expert: int,
    n_local_expert: int,
    expert_map: Optional[torch.Tensor] = None,
    align_block_size: Optional[int] = None,
    fill_invalid_expert: int = -1
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Expand and permute activations so that the tokens routed to each expert
    are gathered together, by calling the ``_moe_C.moe_permute`` custom op.

    Parameters:
    - hidden_states (torch.Tensor): 2-D input activations of the MoE layer.
    - topk_weights (torch.Tensor): top-k routing weight of each token.
    - topk_ids (torch.Tensor): top-k routed expert id of each token.
    - token_expert_indices (torch.Tensor): indices into the expanded hidden
      states.
    - topk (int): number of experts selected per token.
    - n_expert (int): total number of experts.
    - n_local_expert (int): number of experts on the current EP rank.
    - expert_map (Optional[torch.Tensor]): maps expert indices from the
      global expert space to the local expert space of the expert-parallel
      shard.
    - align_block_size (Optional[int]): grouped-GEMM block size to align to
      (used for DeepGemm); ``None`` disables alignment.
    - fill_invalid_expert (int): value used to pre-fill ``m_indices``, as a
      workaround for DeepGemm not supporting -1 in ``m_indices``.
    Returns:
    - permuted_hidden_states (torch.Tensor): permuted activations.
    - expert_first_token_offset (torch.Tensor): offset of the first token of
      each expert for standard grouped GEMM; aligned up to
      ``align_block_size`` when that is set.
    - src_row_id2dst_row_id_map (torch.Tensor): index map consumed by
      ``moe_unpermute``.
    - m_indices (torch.Tensor): for DeepGemm grouped GEMM, ``m_indices[i]``
      records the group the i-th row of the LHS belongs to.
    """
    n_token, n_hidden = hidden_states.size()
    row_bytes = n_hidden * hidden_states.element_size()
    assert row_bytes % 16 == 0, "permue kernel need hidden dim align to 16B"

    device = hidden_states.device

    if align_block_size is None:
        permuted_row_size = n_token * topk
    else:
        # Allow for up to (align_block_size - 1) padding rows per expert,
        # then round the total up to a multiple of the block size.
        unaligned_rows = n_token * topk + n_expert * (align_block_size - 1)
        permuted_row_size = ((unaligned_rows + align_block_size - 1) //
                             align_block_size * align_block_size)

    # Output buffers; their contents are produced by the custom op below.
    permuted_hidden_states = torch.empty((permuted_row_size, n_hidden),
                                         dtype=hidden_states.dtype,
                                         device=device)
    m_indices = torch.full((permuted_row_size, ),
                           fill_invalid_expert,
                           dtype=torch.int32,
                           device=device)
    expert_first_token_offset = torch.empty(n_local_expert + 1,
                                            dtype=torch.int64,
                                            device=device)
    src_row_id2dst_row_id_map = torch.empty((n_token, topk),
                                            dtype=torch.int32,
                                            device=device)

    torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids,
                                 token_expert_indices, expert_map, n_expert,
                                 n_local_expert, topk, align_block_size,
                                 permuted_hidden_states,
                                 expert_first_token_offset,
                                 src_row_id2dst_row_id_map, m_indices)

    return (permuted_hidden_states, expert_first_token_offset,
            src_row_id2dst_row_id_map, m_indices)
def moe_unpermute(
    permuted_hidden_states: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    src_row_id2dst_row_id_map: torch.Tensor,
    expert_first_token_offset: torch.Tensor,
    topk: int,
    n_expert: int,
    n_local_expert: int,
) -> torch.Tensor:
    """
    Reverse ``moe_permute``: hand the permuted expert outputs to the
    ``_moe_C.moe_unpermute`` custom op, which writes one row per token into
    a freshly allocated output tensor.

    Parameters:
    - permuted_hidden_states (torch.Tensor): permuted activations.
    - topk_weights (torch.Tensor): top-k routing weight of each token.
    - topk_ids (torch.Tensor): top-k routed expert id of each token.
    - src_row_id2dst_row_id_map (torch.Tensor): index map returned by
      ``moe_permute``.
    - expert_first_token_offset (torch.Tensor): offset of the first token of
      each expert for grouped GEMM.
    - topk (int): number of experts selected per token.
    - n_expert (int): total number of experts.
    - n_local_expert (int): number of experts on the current EP rank.
    Returns:
    - hidden_states (torch.Tensor): reduced and unpermuted activations with
      shape ``(topk_weights.size(0), permuted_hidden_states.size(-1))``.
    """
    n_token = topk_weights.size(0)
    n_hidden = permuted_hidden_states.size(-1)
    row_bytes = n_hidden * permuted_hidden_states.element_size()
    assert row_bytes % 16 == 0, "unpermue kernel need hidden dim align to 16B"

    # Output buffer filled by the custom op.
    hidden_states = torch.empty((n_token, n_hidden),
                                dtype=permuted_hidden_states.dtype,
                                device=permuted_hidden_states.device)

    torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights,
                                   topk_ids, src_row_id2dst_row_id_map,
                                   expert_first_token_offset, n_expert,
                                   n_local_expert, topk, hidden_states)
    return hidden_states
def moe_permute_unpermute_supported():
    """Return whatever the ``_moe_C.moe_permute_unpermute_supported`` op
    reports about availability of the permute/unpermute kernels."""
    supported = torch.ops._moe_C.moe_permute_unpermute_supported()
    return supported

vllm/model_executor/layers/fused_moe/moe_torch_iterative.py ADDED Viewed

@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn.functional as F
def fused_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    global_num_experts: int,
    expert_map: torch.Tensor = None,
    renormalize: bool = False,
) -> torch.Tensor:
    """Mixture-of-experts forward pass that loops over experts in plain torch.

    Every expert in ``w1``/``w2`` is evaluated densely on all tokens; tokens
    that did not select an expert receive a combined weight of zero for it,
    so that expert contributes nothing to their output.

    Args:
        hidden_states: [*, hidden_size]
        w1: [num_experts, intermediate_size * 2, hidden_size]
        w2: [num_experts, hidden_size, intermediate_size]
        gating_output: [*, global_num_experts]
        topk: Number of experts selected per token.
        global_num_experts: Size of the last dimension of ``gating_output``.
        expert_map: [num_experts] Optional lookup applied to the selected
            expert ids before they are compared with the indices of
            ``w1``/``w2``.
        renormalize: If true, rescale each token's selected weights so they
            sum to one.

    Returns:
        Tensor with the same shape as the incoming ``hidden_states``.
    """
    orig_shape = hidden_states.shape
    hidden_size = hidden_states.shape[-1]
    num_tokens = hidden_states.shape[:-1].numel()
    num_experts = w1.shape[0]
    intermediate_size = w2.shape[-1]
    dtype = hidden_states.dtype

    # reshape (rather than view) so non-contiguous inputs, e.g. a transposed
    # [batch, seq, hidden] tensor, are accepted; for contiguous inputs this
    # is the same zero-copy view as before.
    hidden_states = hidden_states.reshape(num_tokens, hidden_size)
    gating_output = gating_output.reshape(num_tokens, global_num_experts)

    # Routing: softmax in float32, keep the top-k probabilities per token.
    topk_weights = gating_output.softmax(dim=-1, dtype=torch.float)
    topk_weights, selected_experts = topk_weights.topk(topk, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    topk_weights = topk_weights.to(dtype)

    if expert_map is not None:
        selected_experts = expert_map[selected_experts]

    final_hidden_states = None
    for expert_idx in range(num_experts):
        expert_w1 = w1[expert_idx]
        expert_w2 = w2[expert_idx]
        # Per-token weight for this expert: the routing weight where the
        # expert was selected, zero elsewhere.
        expert_mask = (selected_experts == expert_idx)
        expert_weights = (topk_weights * expert_mask).sum(dim=-1, keepdim=True)
        # The first half of the w1 output is gated through SiLU and
        # multiplied with the second half.
        x = F.linear(hidden_states, expert_w1)
        gate = F.silu(x[:, :intermediate_size])
        x = x[:, intermediate_size:] * gate
        x = F.linear(x, expert_w2)
        current_hidden_states = x * expert_weights
        if final_hidden_states is None:
            final_hidden_states = current_hidden_states
        else:
            final_hidden_states = final_hidden_states + current_hidden_states

    return final_hidden_states.reshape(orig_shape)  # type: ignore

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py ADDED Viewed

@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import pplx_kernels as pplx
+import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
+# The max_num_tokens, world_size and dp_size must be the same
+# as the ones used to create the AllToAll.
class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
    """Prepare/finalize stage that exchanges tokens through a pplx AllToAll.

    ``prepare`` quantizes the local activations and sends them out with
    ``a2a.dispatch``; ``finalize`` brings the expert outputs back with
    ``a2a.combine``.
    """

    def __init__(self,
                 a2a: pplx.AllToAll,
                 max_num_tokens: int,
                 world_size: int,
                 rank: int,
                 dp_size: int,
                 quant_dtype: Optional[torch.dtype] = None,
                 block_shape: Optional[list[int]] = None,
                 per_act_token: bool = False):
        super().__init__()
        assert max_num_tokens > 0
        # Communication handle and the topology it was created with.
        self.a2a = a2a
        self.max_num_tokens = max_num_tokens
        self.world_size = world_size
        self.rank = rank
        self.dp_size = dp_size
        # Quantization settings forwarded to moe_kernel_quantize_input.
        self.quant_dtype = quant_dtype
        self.block_shape = block_shape
        self.per_act_token = per_act_token

    def max_num_tokens_per_rank(self) -> Optional[int]:
        """Return the ``max_num_tokens`` given at construction."""
        return self.max_num_tokens

    def topk_indices_dtype(self) -> Optional[torch.dtype]:
        """Return ``torch.uint32`` as the dtype for top-k indices."""
        return torch.uint32

    def prepare(
        self,
        a1: torch.Tensor,
        a1_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        rank_topk_weights: torch.Tensor,
        rank_topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: Optional[torch.Tensor],
        apply_router_weight_on_input: bool,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
               Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Quantize ``a1`` and dispatch it to the expert ranks.

        ``a2_scale`` and ``expert_map`` are accepted but not read here.

        Returns ``(expert_x, expert_x_scale, expert_num_tokens, None, None)``
        where the first three are the buffers passed to ``a2a.dispatch`` as
        outputs (``expert_x_scale`` narrowed to its first column, or ``None``
        when the quantized dtype is wider than one byte).
        """
        token_count = a1.size(0)  # M
        hidden_dim = a1.size(-1)  # K
        assert rank_topk_ids.size(0) == token_count
        # assert expert_map is None, "NYI"

        # Is this always going to be a1.device?
        device = a1.device

        if apply_router_weight_on_input:
            # TODO: this only works for topK=1, will need to update for topK>1
            assert rank_topk_ids.size(1) == 1, (
                "apply_router_weight_on_input is only implemented for topk=1")
            a1 = a1 * rank_topk_weights.to(a1.dtype)

        # With per-token quantization the incoming scale is not forwarded.
        input_scale = None if self.per_act_token else a1_scale
        a1q, a1q_scale = moe_kernel_quantize_input(a1, input_scale,
                                                   self.quant_dtype,
                                                   self.per_act_token,
                                                   self.block_shape)
        if a1q_scale is not None:
            # Tile the scale: once per input row unless it is already
            # per-token, and 4 times along the columns.
            # NOTE(review): the reason for the factor 4 is not visible here.
            row_repeats = 1 if self.per_act_token else a1.shape[0]
            a1q_scale = a1q_scale.repeat(row_repeats, 4)

        # rem_experts need to be 0 for pplx to work properly.
        experts_per_rank, rem_experts = divmod(num_experts, self.world_size)
        assert rem_experts == 0
        num_local_experts = experts_per_rank + (1 if self.rank < rem_experts
                                                else 0)

        expert_num_tokens = torch.empty(num_local_experts,
                                        dtype=torch.int32,
                                        device=device)

        num_dp = self.world_size // self.dp_size
        expert_x = torch.empty(
            (num_local_experts, self.max_num_tokens * num_dp, hidden_dim),
            dtype=a1q.dtype,
            device=device)

        expert_x_scale: Optional[torch.Tensor] = None
        if a1q.dtype.itemsize == 1:
            # One-byte activations get a float32 scale buffer.
            block_elems = (1 if self.block_shape is None else
                           self.block_shape[0])
            block_size = block_elems * torch.float32.itemsize
            scale_cols = (expert_x.size(2) + block_size - 1) // block_size
            expert_x_scale = torch.empty(
                (num_local_experts, expert_x.size(1), scale_cols),
                dtype=torch.float32,
                device=device)

        # bound_m is optional and defaults to indices.size(0); there is not
        # much point setting it unless it differs from indices.size(0).
        self.a2a.dispatch(
            out_expert_num_tokens=expert_num_tokens,
            out_expert_x=expert_x,
            out_expert_x_scale=expert_x_scale,
            dp_x=a1q,
            dp_x_scale=a1q_scale,
            indices=rank_topk_ids,
            bound_m=None,
        )

        if expert_x_scale is not None:
            expert_x_scale = expert_x_scale[:, :, 0:1]

        return expert_x, expert_x_scale, expert_num_tokens, None, None

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        """Combine the expert outputs into ``output`` via ``a2a.combine``."""
        num_tokens = output.size(0)  # M

        assert topk_ids.size(0) == num_tokens, (
            f"{topk_ids.size(0)} == {num_tokens}")
        assert output.size(0) <= self.max_num_tokens, (
            f"{output.size(0)} <= {self.max_num_tokens}")
        assert output.size(1) == fused_expert_output.size(-1)

        # Set weights to 1 if we did them in dispatch. This is hacky.
        combine_weights = (torch.ones_like(topk_weights)
                           if apply_router_weight_on_input else topk_weights)

        # bound_m is optional; there is not much point setting it unless it
        # differs from topk_ids.size(0).
        self.a2a.combine(out_tokens=output,
                         indices=topk_ids,
                         weights=combine_weights,
                         expert_y=fused_expert_output,
                         bound_m=None)

vllm/model_executor/layers/fused_moe/prepare_finalize.py ADDED Viewed

@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
+    _moe_unpermute_and_reduce)
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
    """Prepare/finalize stage that involves no cross-rank communication.

    ``prepare`` only quantizes the activations; ``finalize`` delegates to
    ``_moe_unpermute_and_reduce`` without an inverse permutation.
    """

    def __init__(
        self,
        quant_dtype: Optional[torch.dtype] = None,
        per_channel_quant: bool = False,
        block_shape: Optional[list[int]] = None,
    ):
        super().__init__()
        # Quantization settings forwarded to moe_kernel_quantize_input.
        self.quant_dtype = quant_dtype
        self.per_channel_quant = per_channel_quant
        self.block_shape = block_shape

    def max_num_tokens_per_rank(self) -> Optional[int]:
        """Return ``None``; this implementation reports no per-rank limit."""
        return None

    def topk_indices_dtype(self) -> Optional[torch.dtype]:
        """Return ``None``; no index dtype is requested."""
        return None

    def prepare(
        self,
        a1: torch.Tensor,
        a1_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: Optional[torch.Tensor],
        apply_router_weight_on_input: bool = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
               Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Optionally apply the router weights to ``a1`` and quantize it.

        When ``apply_router_weight_on_input`` is set, ``a1`` is scaled in
        place, so the caller's tensor is modified.

        Returns ``(a1q, a1q_scale, None, None, None)``.
        """
        if apply_router_weight_on_input:
            # TODO: this only works for topK=1, will need to update for topK>1
            assert topk_ids.size(1) == 1, \
                "apply_router_weight_on_input is only implemented for topk=1"
            a1.mul_(topk_weights.to(a1.dtype))

        quantized, quantized_scale = moe_kernel_quantize_input(
            a1, a1_scale, self.quant_dtype, self.per_channel_quant,
            self.block_shape)
        return quantized, quantized_scale, None, None, None

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        """Weight and reduce ``fused_expert_output`` into ``output``."""
        # No inverse permutation is passed.
        inv_perm = None
        _moe_unpermute_and_reduce(output, fused_expert_output, inv_perm,
                                  topk_weights, apply_router_weight_on_input)