PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/model_executor/layers/logits_processor.py ADDED Viewed

@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""A layer that compute logits from hidden_stats."""
+import inspect
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional
+import torch
+import torch.nn as nn
+import vllm.envs as envs
+from vllm.distributed import (tensor_model_parallel_all_gather,
+                              tensor_model_parallel_gather)
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
+_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None
+if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None:
+    _logits_processor_threadpool = ThreadPoolExecutor(
+        envs.VLLM_LOGITS_PROCESSOR_THREADS)
+class LogitsProcessor(nn.Module):
+    """Process logits and apply logits processors from sampling metadata.
+    This layer does the following:
+    1. Gather logits from model hidden_states.
+    2. Scale logits if needed.
+    3. Apply logits processors (if any).
+    """
+    def __init__(self,
+                 vocab_size: int,
+                 org_vocab_size: Optional[int] = None,
+                 scale: float = 1.0,
+                 logits_as_input: bool = False,
+                 soft_cap: Optional[float] = None) -> None:
+        """
+        Args:
+            scale: A scaling factor to apply to the logits.
+        """
+        super().__init__()
+        self.scale = scale
+        self.vocab_size = vocab_size
+        # Whether the input is logits (default is hidden states).
+        self.logits_as_input = logits_as_input
+        # original vocabulary size (without LoRA).
+        self.org_vocab_size = org_vocab_size or vocab_size
+        # Soft cap the logits. Used in Gemma 2.
+        self.soft_cap = soft_cap
+        # Whether to use gather or all-gather to gather the logits.
+        self.use_all_gather = current_platform.use_all_gather()
+    def forward(
+        self,
+        lm_head: VocabParallelEmbedding,
+        hidden_states: torch.Tensor,
+        sampling_metadata: Optional[SamplingMetadata] = None,
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> Optional[torch.Tensor]:
+        if self.logits_as_input:
+            logits = hidden_states
+        else:
+            if sampling_metadata is not None:
+                hidden_states = _prune_hidden_states(hidden_states,
+                                                     sampling_metadata)
+            # Get the logits for the next tokens.
+            logits = self._get_logits(hidden_states, lm_head, embedding_bias)
+        if logits is not None:
+            if self.soft_cap is not None:
+                logits = logits / self.soft_cap
+                logits = torch.tanh(logits)
+                logits = logits * self.soft_cap
+            if self.scale != 1.0:
+                logits *= self.scale
+            # Apply logits processors (if any).
+            if sampling_metadata is not None and \
+                sampling_metadata.seq_groups is not None:
+                logits = _apply_logits_processors(logits, sampling_metadata)
+        return logits
+    def _gather_logits(self, logits: torch.Tensor) -> torch.Tensor:
+        """gather/all-gather the logits tensor across model parallel group."""
+        if self.use_all_gather:
+            # Gather is not supported for some devices such as TPUs.
+            # Use all-gather instead.
+            # NOTE(woosuk): Here, the outputs of every device should not be None
+            # because XLA requires strict SPMD among all devices. Every device
+            # should execute the same operations after gathering the logits.
+            logits = tensor_model_parallel_all_gather(logits)
+        else:
+            # None may be returned for rank > 0
+            logits = tensor_model_parallel_gather(logits)
+        return logits
+    def _get_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        embedding_bias: Optional[torch.Tensor],
+    ) -> Optional[torch.Tensor]:
+        # Get the logits for the next tokens.
+        logits = lm_head.quant_method.apply(lm_head,
+                                            hidden_states,
+                                            bias=embedding_bias)
+        # Gather logits for TP
+        logits = self._gather_logits(logits)
+        # Remove paddings in vocab (if any).
+        if logits is not None:
+            logits = logits[..., :self.org_vocab_size]
+        return logits
+    def extra_repr(self) -> str:
+        s = f"vocab_size={self.vocab_size}"
+        s += f", org_vocab_size={self.org_vocab_size}"
+        s += f", scale={self.scale}, logits_as_input={self.logits_as_input}"
+        return s
+def _prune_hidden_states(
+    hidden_states: torch.Tensor,
+    sampling_metadata: SamplingMetadata,
+) -> torch.Tensor:
+    # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios
+    # (warmup, profile_run) we might not have selected_token_indices,
+    # so we skip pruning.
+    if sampling_metadata.selected_token_indices is not None:
+        return hidden_states.index_select(
+            0, sampling_metadata.selected_token_indices)
+    else:
+        return hidden_states
+def _apply_logits_processors(
+    logits: torch.Tensor,
+    sampling_metadata: SamplingMetadata,
+) -> torch.Tensor:
+    found_logits_processors = False
+    logits_processed = 0
+    logits_row_ids_and_logits_row_futures = []
+    for seq_group in sampling_metadata.seq_groups:
+        seq_ids = seq_group.seq_ids
+        sampling_params = seq_group.sampling_params
+        logits_processors = sampling_params.logits_processors
+        if logits_processors:
+            found_logits_processors = True
+            for seq_id, logits_row_idx in zip(seq_ids,
+                                              seq_group.sample_indices):
+                logits_row = logits[logits_row_idx]
+                past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids
+                prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids
+                if _logits_processor_threadpool is not None:
+                    logits_row_ids_and_logits_row_futures.append(
+                        (logits_row_idx,
+                         _logits_processor_threadpool.submit(
+                             _apply_logits_processors_single_seq, logits_row,
+                             logits_processors, past_tokens_ids,
+                             prompt_tokens_ids)))
+                else:
+                    logits[logits_row_idx] = \
+                        _apply_logits_processors_single_seq(
+                            logits_row, logits_processors, past_tokens_ids,
+                            prompt_tokens_ids)
+        logits_processed += len(seq_group.sample_indices) + len(
+            seq_group.prompt_logprob_indices)
+    for logits_row_idx, future in logits_row_ids_and_logits_row_futures:
+        logits[logits_row_idx] = future.result()
+    if found_logits_processors:
+        # verifies that no rows in logits were missed unexpectedly
+        assert logits_processed == logits.shape[0]
+    return logits
+def _apply_logits_processors_single_seq(logits_row, logits_processors,
+                                        past_tokens_ids,
+                                        prompt_tokens_ids) -> torch.Tensor:
+    for logits_processor in logits_processors:
+        parameters = inspect.signature(logits_processor).parameters
+        if len(parameters) == 3:
+            logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids,
+                                          logits_row)
+        else:
+            logits_row = logits_processor(past_tokens_ids, logits_row)
+    return logits_row

vllm/model_executor/layers/mamba/__init__.py ADDED Viewed

File without changes

vllm/model_executor/layers/mamba/mamba2_metadata.py ADDED Viewed

@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+from dataclasses import dataclass
+import torch
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.backends.placeholder_attn import (
+    PlaceholderAttentionMetadata)
+from vllm.platforms import current_platform
+@dataclass
+class Mamba2Metadata:
+    has_initial_states: torch.Tensor
+    prep_initial_states: bool
+    chunk_size: int
+    seq_idx: torch.Tensor
+    chunk_indices: torch.Tensor
+    chunk_offsets: torch.Tensor
+def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]:
+    """Returns the appropriate metadata classes for the current platform."""
+    if current_platform.is_rocm():
+        from vllm.attention.backends.rocm_flash_attn import (
+            ROCmFlashAttentionMetadata)
+        return (ROCmFlashAttentionMetadata, PlaceholderAttentionMetadata)
+    elif current_platform.is_cuda():
+        from vllm.attention.backends.flash_attn import FlashAttentionMetadata
+        from vllm.attention.backends.xformers import XFormersMetadata
+        return (FlashAttentionMetadata, XFormersMetadata,
+                PlaceholderAttentionMetadata)
+    raise ValueError(
+        f"Unsupported platform for Mamba2: {current_platform.device_type}")
+def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor,
+                                              chunk_size: int,
+                                              total_seqlens: int):
+    cu_seqlens = query_start_loc[1:]  # remove prepended 0
+    # outputs will have length expansion of chunks that do not divide
+    # chunk_size
+    N = math.ceil(total_seqlens / chunk_size) + (cu_seqlens[:-1] % chunk_size
+                                                 > 0).sum()
+    chunk_indices = torch.arange(N,
+                                 dtype=torch.int,
+                                 device=query_start_loc.device)
+    chunk_offsets = torch.zeros((N, ),
+                                dtype=torch.int,
+                                device=query_start_loc.device)
+    p = 0  # num of insertions
+    for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:]):
+        # if does not divide chunk_size, then there is one chunk insertion
+        p += (s % chunk_size > 0)
+        # get the dimensions
+        # - the + 1 for _e is to shift the boundary by one chunk
+        # - this shifting is not needed if chunk_size divides e
+        _s, _e = s // chunk_size + p, e // chunk_size + p + (e % chunk_size
+                                                             > 0)
+        # adjust inidces and offsets
+        chunk_indices[_s:_e] -= p
+        chunk_offsets[_s] = s % chunk_size
+    return chunk_indices, chunk_offsets
+def prepare_mamba2_metadata(
+    chunk_size: int,
+    attn_metadata: AttentionMetadata,
+) -> Mamba2Metadata:
+    # compute number of prefill and decode requests
+    # NOTE: in V0 we assume prefills are before decodes
+    num_prefills = attn_metadata.num_prefills
+    num_prefill_tokens = attn_metadata.num_prefill_tokens
+    seq_idx = None
+    chunk_indices, chunk_offsets = None, None
+    # Need flags to indicate if there are initial states
+    # currently we really only support the FlashAttention backend
+    has_initial_states = None
+    prep_initial_states = False
+    # Compute seq_idx, chunk_indices and chunk_offsets for prefill only
+    if num_prefills > 0:
+        attn_metadata_instances = get_platform_metadata_classes()
+        if (isinstance(attn_metadata, attn_metadata_instances)
+                and attn_metadata.context_lens_tensor is not None):
+            has_initial_states = \
+                attn_metadata.context_lens_tensor[:num_prefills] > 0  #[batch,]
+            # precompute flag to avoid device syncs in mamba2 layer forwards
+            # prep is only needed for mamba2 ssd prefill processing
+            prep_initial_states = torch.any(has_initial_states).item()
+        query_start_loc = attn_metadata.query_start_loc[:num_prefills + 1]
+        seq_idx = torch.repeat_interleave(torch.arange(
+            num_prefills, dtype=torch.int32, device=query_start_loc.device),
+                                          query_start_loc.diff(),
+                                          output_size=num_prefill_tokens)
+        seq_idx.unsqueeze_(0)
+        # We compute metadata for chunked prefill once at the top level model
+        # forward and reuse them in mamba layers. If not needed, they will be
+        # ignored inside mamba kernels.
+        if prep_initial_states:
+            chunk_indices, chunk_offsets = \
+                _query_start_loc_to_chunk_indices_offsets(
+                query_start_loc, chunk_size, num_prefill_tokens)
+    return Mamba2Metadata(has_initial_states=has_initial_states,
+                          prep_initial_states=prep_initial_states,
+                          chunk_size=chunk_size,
+                          seq_idx=seq_idx,
+                          chunk_indices=chunk_indices,
+                          chunk_offsets=chunk_offsets)

vllm/model_executor/layers/mamba/mamba_mixer.py ADDED Viewed

@@ -0,0 +1,245 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from torch import nn
+from torch.nn.parameter import Parameter
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
+    causal_conv1d_fn, causal_conv1d_update)
+from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
+    selective_scan_fn, selective_state_update)
+from vllm.model_executor.models.mamba_cache import MambaCacheParams
+from vllm.model_executor.utils import set_weight_attrs
+# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+@CustomOp.register("mamba_mixer")
+class MambaMixer(CustomOp):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute
+    the `contextualized_states`. A, D are input independent
+    (see Mamba paper [1] Section 3.5.2 "Interpretation of A"
+    for why A isn't selective) ∆, B, C are input-dependent
+    (this is a key difference between Mamba and the linear time
+    invariant S4, and is why Mamba is called
+    **selective** state spaces)
+    """
+    def __init__(self,
+                 hidden_size: int,
+                 ssm_state_size: int,
+                 conv_kernel_size: int,
+                 intermediate_size: int,
+                 time_step_rank: int,
+                 use_conv_bias: bool,
+                 use_bias: bool,
+                 use_rms_norm: bool,
+                 rms_norm_has_weight: bool = True,
+                 rms_norm_eps: float = 1e-5,
+                 activation="silu",
+                 is_lora_enabled: bool = False):
+        super().__init__()
+        self.time_step_rank = time_step_rank
+        self.ssm_state_size = ssm_state_size
+        self.use_rms_norm = use_rms_norm
+        self.activation = activation
+        self.is_lora_enabled = is_lora_enabled
+        self.conv1d = ColumnParallelLinear(
+            input_size=conv_kernel_size,
+            output_size=intermediate_size,
+            bias=use_conv_bias,
+        )
+        # unsqueeze to fit conv1d weights shape into the linear weights shape.
+        # Can't do this in `weight_loader` since it already exists in
+        # `ColumnParallelLinear` and `set_weight_attrs`
+        # doesn't allow to override it
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+        self.in_proj = MergedColumnParallelLinear(hidden_size,
+                                                  [intermediate_size] * 2,
+                                                  bias=use_bias)
+        # selective projection used to make dt, B and C input dependent
+        self.x_proj = RowParallelLinear(
+            intermediate_size,
+            time_step_rank + ssm_state_size * 2,
+            bias=False,
+        )
+        # time step projection (discretization) -
+        # In the forward we need to apply dt_proj without the bias,
+        # as the bias is added in the selective scan kernel.
+        self.dt_proj = ColumnParallelLinear(time_step_rank,
+                                            intermediate_size,
+                                            bias=True,
+                                            skip_bias_add=True)
+        def weight_loader(param: Parameter, loaded_weight: torch.Tensor):
+            tp_rank = get_tensor_model_parallel_rank()
+            tp_size = get_tensor_model_parallel_world_size()
+            param.data.copy_(
+                loaded_weight.data.split(loaded_weight.shape[0] // tp_size,
+                                         dim=0)[tp_rank])
+        def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor):
+            weight_loader(param, -torch.exp(loaded_weight.float()))
+        tp_size = get_tensor_model_parallel_world_size()
+        self.A = nn.Parameter(
+            torch.empty(
+                intermediate_size // tp_size,
+                ssm_state_size,
+                dtype=torch.float32,
+            ))
+        self.D = nn.Parameter(torch.ones(intermediate_size // tp_size))
+        set_weight_attrs(self.D, {"weight_loader": weight_loader})
+        set_weight_attrs(self.A, {"weight_loader": A_weight_loader})
+        self.out_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=use_bias,
+            input_is_parallel=True,
+        )
+        self.dt_layernorm = RMSNorm(
+            time_step_rank,
+            eps=rms_norm_eps,
+            has_weight=rms_norm_has_weight,
+        ) if use_rms_norm else None
+        self.b_layernorm = RMSNorm(
+            ssm_state_size,
+            eps=rms_norm_eps,
+            has_weight=rms_norm_has_weight,
+        ) if use_rms_norm else None
+        self.c_layernorm = RMSNorm(
+            ssm_state_size,
+            eps=rms_norm_eps,
+            has_weight=rms_norm_has_weight,
+        ) if use_rms_norm else None
+    def forward_native(self, hidden_states: torch.Tensor,
+                       conv_state: torch.Tensor, ssm_state: torch.Tensor):
+        pass
+    def forward_cuda(self, hidden_states: torch.Tensor,
+                     mamba_cache_params: MambaCacheParams):
+        attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
+        # 1. Gated MLP's linear projection
+        projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
+        hidden_states, gate = projected_states.chunk(2, dim=-2)
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
+                                               self.conv1d.weight.size(2))
+        if attn_metadata.query_start_loc is not None \
+            and attn_metadata.context_lens_tensor is not None:
+            # |---------- N-1 iteration --------|
+            # |---------------- N iteration ---------------------|
+            # |- tokenA -|......................|-- newTokens ---|
+            # |---------- context_len ----------|
+            # |-------------------- seq_len ---------------------|
+            #                                   |-- query_len ---|
+            hidden_states = causal_conv1d_fn(
+                hidden_states,
+                conv_weights,
+                self.conv1d.bias,
+                activation=self.activation,
+                conv_states=mamba_cache_params.conv_state,
+                has_initial_state=attn_metadata.context_lens_tensor > 0,
+                cache_indices=mamba_cache_params.state_indices_tensor,
+                query_start_loc=attn_metadata.query_start_loc)
+        else:
+            hidden_states = causal_conv1d_update(
+                hidden_states.transpose(0, 1),
+                mamba_cache_params.conv_state,
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+                conv_state_indices=mamba_cache_params.state_indices_tensor)
+            hidden_states = hidden_states.transpose(0, 1)
+        # 3. State Space Model sequence transformation
+        # 3.a. input varying initialization of time_step, B and C
+        if self.is_lora_enabled:
+            #   lora kernel requires contiguous tensor
+            ssm_parameters = self.x_proj(
+                hidden_states.transpose(-2, -1).contiguous())[0]
+        else:
+            ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
+        time_step, B, C = torch.split(
+            ssm_parameters,
+            [self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
+            dim=-1,
+        )
+        if self.use_rms_norm:
+            assert self.dt_layernorm is not None
+            assert self.b_layernorm is not None
+            assert self.c_layernorm is not None
+            time_step = self.dt_layernorm(time_step.contiguous())
+            B = self.b_layernorm(B.contiguous())
+            C = self.c_layernorm(C.contiguous())
+        discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1)
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        time_proj_bias = (self.dt_proj.bias.float() if hasattr(
+            self.dt_proj, "bias") else None)
+        if attn_metadata.query_start_loc is not None \
+            and attn_metadata.context_lens_tensor is not None:
+            scan_outputs = selective_scan_fn(
+                hidden_states,
+                mamba_cache_params.ssm_state,
+                discrete_time_step,
+                self.A,
+                B.transpose(-2, -1),
+                C.transpose(-2, -1),
+                self.D.float(),
+                gate,
+                time_proj_bias,
+                delta_softplus=True,
+                cache_indices=mamba_cache_params.state_indices_tensor,
+                has_initial_state=attn_metadata.context_lens_tensor > 0,
+                query_start_loc=attn_metadata.query_start_loc)
+        else:
+            scan_outputs = selective_state_update(
+                mamba_cache_params.ssm_state,
+                hidden_states.transpose(0, 1),
+                discrete_time_step.transpose(0, 1),
+                self.A,
+                B,
+                C,
+                self.D,
+                gate.transpose(0, 1),
+                time_proj_bias,
+                dt_softplus=True,
+                state_batch_indices=mamba_cache_params.state_indices_tensor)
+            scan_outputs = scan_outputs.transpose(0, 1)
+        # 4. Final linear projection
+        if self.is_lora_enabled:
+            #  lora kernel requires contiguous tensor
+            contextualized_states = self.out_proj(
+                scan_outputs.transpose(-2, -1).contiguous())[0]
+        else:
+            contextualized_states = self.out_proj(
+                scan_outputs.transpose(-2, -1))[0]
+        return contextualized_states