PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/model_executor/layers/mamba/ops/ssd_combined.py ADDED Viewed

@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_combined.py
+# ruff: noqa: E501
+import torch
+from einops import rearrange
+from packaging import version
+from vllm.triton_utils import triton
+from .ssd_bmm import _bmm_chunk_fwd
+from .ssd_chunk_scan import _chunk_scan_fwd
+from .ssd_chunk_state import (_chunk_cumsum_fwd, _chunk_state_fwd,
+                              chunk_state_varlen)
+from .ssd_state_passing import _state_passing_fwd
+# True when the installed Triton version is 2.2.0 or newer.
+TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0')
+def is_int_pow_2(n):
+    return isinstance(n, int) and n > 0 and (n & (n - 1)) == 0
+def _mamba_chunk_scan_combined_fwd(x,
+                                   dt,
+                                   A,
+                                   B,
+                                   C,
+                                   chunk_size,
+                                   out,
+                                   D=None,
+                                   z=None,
+                                   dt_bias=None,
+                                   initial_states=None,
+                                   seq_idx=None,
+                                   chunk_indices=None,
+                                   chunk_offsets=None,
+                                   cu_seqlens=None,
+                                   dt_softplus=False,
+                                   dt_limit=(0.0, float("inf")),
+                                   state_dtype=None):
+    """Run the chunked Mamba SSD forward pass on a varlen (packed) batch.
+
+    The shapes listed here are the ones enforced by the assertions below.
+
+    Args:
+        x: (seqlen, nheads, headdim)
+        dt: (seqlen, nheads)
+        A: (nheads,)
+        B: (seqlen, ngroups, dstate); nheads must be divisible by ngroups
+        C: same shape as B
+        chunk_size: positive integer power of two
+        out: output tensor handed to ``_chunk_scan_fwd``, which updates it
+            in place
+        D: optional, (nheads, headdim) or (nheads,)
+        z: optional, same shape as x
+        dt_bias: optional, forwarded to ``_chunk_cumsum_fwd``
+        initial_states: optional,
+            (len(cu_seqlens) - 1, nheads, headdim, dstate)
+        seq_idx: optional here, (seqlen,) when given
+        chunk_indices: forwarded to ``_chunk_scan_fwd``
+        chunk_offsets: forwarded to ``_state_passing_fwd`` and
+            ``_chunk_scan_fwd``
+        cu_seqlens: required; forwarded to ``chunk_state_varlen``
+        dt_softplus: forwarded to ``_chunk_cumsum_fwd``
+        dt_limit: forwarded to ``_chunk_cumsum_fwd``
+        state_dtype: dtype requested from ``_state_passing_fwd``; falls back
+            to ``C.dtype`` when None
+
+    Returns:
+        The value returned by ``chunk_state_varlen``.
+    """
+    assert is_int_pow_2(chunk_size), "chunk_size must be integer power of 2"
+    seqlen, nheads, headdim = x.shape
+    _, ngroups, dstate = B.shape
+    assert nheads % ngroups == 0
+    assert B.shape == (seqlen, ngroups, dstate)
+    assert dt.shape == (seqlen, nheads)
+    assert A.shape == (nheads, )
+    assert C.shape == B.shape
+    if z is not None:
+        assert z.shape == x.shape
+    if D is not None:
+        assert D.shape == (nheads, headdim) or D.shape == (nheads, )
+    if seq_idx is not None:
+        assert seq_idx.shape == (seqlen, )
+    # Make inputs contiguous only where their strides require it.
+    if B.stride(-1) != 1:
+        B = B.contiguous()
+    if C.stride(-1) != 1:
+        C = C.contiguous()
+    if x.stride(-1) != 1 and x.stride(
+            0) != 1:  # Either M or K dimension should be contiguous
+        x = x.contiguous()
+    if z is not None and z.stride(-1) != 1 and z.stride(
+            0) != 1:  # Either M or K dimension should be contiguous
+        z = z.contiguous()
+    if D is not None and D.stride(-1) != 1:
+        D = D.contiguous()
+    assert cu_seqlens is not None, "Assuming varlen input - must supply cu_seqlens"
+    if initial_states is not None:
+        # One initial state per sequence: cu_seqlens has one more entry than
+        # there are sequences.
+        assert initial_states.shape == (len(cu_seqlens) - 1, nheads, headdim,
+                                        dstate)
+    # This function executes 5 sub-functions for computing mamba
+    # - a good resource is the blog https://goombalab.github.io/blog/2024/mamba2-part3-algorithm/
+    #   which has a minimal implementation to understand the below operations
+    # - as explained by the blog, mamba is a special case of causal attention
+    # - the idea is to chunk the attention matrix and compute each
+    #   submatrix separately using different optimizations.
+    # - see the blog and paper for a visualization of the submatrices
+    #   which we refer to in the comments below
+    # 1. Compute chunked cumsum of A * dt
+    # - here dt may go through a softplus activation
+    dA_cumsum, dt = _chunk_cumsum_fwd(dt,
+                                      A,
+                                      chunk_size,
+                                      dt_bias=dt_bias,
+                                      dt_softplus=dt_softplus,
+                                      dt_limit=dt_limit)
+    # 2. Compute the state for each intra-chunk
+    # (right term of low-rank factorization of off-diagonal blocks; B terms)
+    states = _chunk_state_fwd(B,
+                              x,
+                              dt,
+                              dA_cumsum,
+                              seq_idx=seq_idx,
+                              states_in_fp32=True)
+    # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
+    # (middle term of factorization of off-diag blocks; A terms)
+    # - for handling chunked prefill, this requires i) initial_states
+    #   ii) seq_idx iii) is_cont_batched and (iv) chunk_offsets to be all specified.
+    #   NOTE(review): `is_cont_batched` is not a parameter of this function
+    #   and is not passed below - this item looks stale; confirm.
+    # - When a new seq_idx is detected, we will stop passing the prev_state
+    #   and switch accordingly to the init_state corresponding to the new seq_idx.
+    # - We will also make sure that the dA_cumsum is taken only from the start of the
+    #   sequence (hence we need the full dA_cumsum tensor and not just the values at chunk boundaries)
+    # - this will ensure that states will be updated with the rightmost flushed seq_idx
+    #   of the previous chunk. This implies that the first chunk of states is either 0
+    #   or equal to init_states of the first example.
+    # The trailing (headdim, dstate) axes are flattened for the kernel and
+    # restored right after it returns.
+    states = _state_passing_fwd(
+        rearrange(states, "... p n -> ... (p n)"),
+        dA_cumsum,  # (nheads, nchunks, chunk_size)
+        initial_states=rearrange(initial_states, "... p n -> ... (p n)")
+        if initial_states is not None else
+        None,  # (batch, nheads, headdim*dstate)
+        seq_idx=seq_idx,
+        out_dtype=state_dtype if state_dtype is not None else C.dtype,
+        chunk_offsets=chunk_offsets)
+    states = rearrange(states, "... (p n) -> ... p n", n=dstate)
+    # 4. Compute batched matrix multiply for C_j^T B_i terms
+    CB = _bmm_chunk_fwd(C,
+                        B,
+                        chunk_size,
+                        seq_idx=seq_idx,
+                        output_dtype=torch.float32)
+    # 5. Scan and compute the diagonal blocks, taking into
+    #    account past causal states.
+    # - if initial states are provided, then states information will be
+    #   augmented with initial_states.
+    # - to do this properly, we need to account for example changes in
+    #   the continuous batch, therefore we introduce pseudo chunks, which is
+    #   a chunk that is split up each time an example changes.
+    # - in each (pseudo) chunk, we detect if the previous (pseudo) chunk had
+    #   a seq_idx change, in which case we take states information from
+    #   init_states.
+    _chunk_scan_fwd(
+        CB,
+        x,
+        dt,
+        dA_cumsum,
+        C,
+        states,
+        out,  # in-place update
+        seq_idx,
+        D=D,
+        z=z,
+        chunk_indices=chunk_indices,
+        chunk_offsets=chunk_offsets,
+        initial_states=initial_states,
+    )
+    # Finally, derive the states returned to the caller from the chunk
+    # states and the sequence boundaries in cu_seqlens.
+    # (presumably one state per sequence - confirm in chunk_state_varlen)
+    varlen_states = chunk_state_varlen(
+        B,
+        x,
+        dt,
+        dA_cumsum,
+        cu_seqlens,
+        states,
+        initial_states=initial_states,
+    )
+    return varlen_states
+def mamba_chunk_scan_combined_varlen(
+        x,
+        dt,
+        A,
+        B,
+        C,
+        chunk_size,
+        cu_seqlens,
+        seq_idx,
+        out,
+        D=None,
+        z=None,
+        dt_bias=None,
+        initial_states=None,
+        chunk_indices=None,
+        chunk_offsets=None,
+        dt_softplus=False,
+        dt_limit=(0.0, float("inf")),
+        state_dtype=None,
+):
+    """
+    Argument:
+        x: (seqlen, nheads, headdim)
+        dt: (seqlen, nheads)
+        A: (nheads)
+        B: (seqlen, ngroups, dstate)
+        C: (seqlen, ngroups, dstate)
+        chunk_size: int
+        seq_idx: (seqlen)
+        cu_seqlens: (batch + 1)
+        out: (seqlen, nheads, headdim) preallocated output tensor
+        D: (nheads, headdim) or (nheads,)
+        z: (seqlen, nheads, headdim)
+        dt_bias: (nheads,)
+        initial_states: (batch, nheads, headdim, dstate)
+        dt_softplus: Whether to apply softplus to dt
+        out: (seqlen, nheads, headdim) preallocated output tensor
+        state_dtype: The data type of the ssm state
+    Return:
+        varlen_states: (batch, nheads, headdim, dstate)
+    """
+    assert cu_seqlens is not None, "cu_seqlens must be provided assuming varlen input"
+    assert seq_idx is not None
+    varlen_states = _mamba_chunk_scan_combined_fwd(
+        x,
+        dt,
+        A,
+        B,
+        C,
+        chunk_size,
+        out,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        initial_states=initial_states,
+        seq_idx=seq_idx,
+        chunk_indices=chunk_indices,
+        chunk_offsets=chunk_offsets,
+        cu_seqlens=cu_seqlens,
+        dt_softplus=dt_softplus,
+        dt_limit=dt_limit,
+        state_dtype=state_dtype)
+    return varlen_states

vllm/model_executor/layers/mamba/ops/ssd_state_passing.py ADDED Viewed

@@ -0,0 +1,200 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_state_passing.py
+# ruff: noqa: E501
+import torch
+from vllm.triton_utils import tl, triton
+# State-passing kernel: walks the chunks in order and writes, for every chunk,
+# the state that enters it. BLOCK_SIZE is autotuned per value of `dim`.
+@triton.autotune(
+    configs=[
+        triton.Config({'BLOCK_SIZE': 64}),
+        triton.Config({'BLOCK_SIZE': 128}),
+        triton.Config({'BLOCK_SIZE': 256}),
+        triton.Config({'BLOCK_SIZE': 512}),
+        triton.Config({'BLOCK_SIZE': 1024}),
+        triton.Config({'BLOCK_SIZE': 2048}),
+    ],
+    key=['dim'],
+)
+@triton.jit
+def _state_passing_fwd_kernel(
+    # Pointers to matrices
+    states_ptr,
+    out_ptr,
+    dA_cs_ptr,
+    initstates_ptr,
+    seq_idx_ptr,
+    chunk_offsets_ptr,
+    chunk_meta_num,
+    # Matrix dimensions
+    dim: tl.constexpr,
+    nchunks,
+    seqlen,
+    chunk_size: tl.constexpr,
+    # Strides
+    stride_states_chunk: tl.int64,
+    stride_states_head: tl.int64,
+    stride_states_dim: tl.constexpr,
+    stride_out_chunk: tl.int64,
+    stride_out_head: tl.int64,
+    stride_out_dim: tl.constexpr,
+    stride_dA_cs_head: tl.int64,
+    stride_dA_cs_chunk: tl.int64,
+    stride_dA_cs_csize: tl.constexpr,
+    stride_initstates_batch: tl.int64,
+    stride_initstates_head: tl.int64,
+    stride_initstates_dim: tl.constexpr,
+    stride_seq_idx_seqlen: tl.constexpr,
+    # Meta-parameters
+    HAS_INITSTATES: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # Grid axis 1 selects the head, axis 0 selects a BLOCK_SIZE-wide slice of
+    # the flattened state dimension.
+    pid_h = tl.program_id(axis=1)
+    pid_m = tl.program_id(axis=0)
+    states_ptr += pid_h * stride_states_head
+    # Point dA_cs_ptr at the last position of the first chunk, i.e. the
+    # cumulative sum over the whole chunk.
+    dA_cs_ptr += pid_h * stride_dA_cs_head + (chunk_size -
+                                              1) * stride_dA_cs_csize
+    out_ptr += pid_h * stride_out_head
+    if HAS_INITSTATES:
+        initstates_ptr += pid_h * stride_initstates_head
+    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    states_ptrs = states_ptr + offs_m * stride_states_dim
+    out_ptrs = out_ptr + offs_m * stride_out_dim
+    # - states will be the past state of the sequence that continues on the current chunk
+    if not HAS_INITSTATES:
+        states = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32)
+    else:
+        initstates_ptr += offs_m * stride_initstates_dim
+        initstates_ptrs = initstates_ptr
+        # - for continuous batches, the first chunk starts from the first
+        #   sequence's init state (no batch offset has been applied yet)
+        states = tl.load(initstates_ptrs, mask=offs_m < dim,
+                         other=0.0).to(tl.float32)
+    # The output slot of the first chunk receives the starting state.
+    tl.store(out_ptrs, states, mask=offs_m < dim)
+    out_ptrs += stride_out_chunk
+    prev_seq_idx_chunk_end = 0
+    logical_chunk_idx = 0
+    # Each iteration consumes the local state of chunk c and stores the
+    # result in the output slot of chunk c + 1, hence nchunks - 1 iterations.
+    for c in range(nchunks - 1):
+        new_states = tl.load(states_ptrs, mask=offs_m < dim,
+                             other=0.0).to(tl.float32)
+        dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
+        scale_mask = True
+        # - the seq to pass forward is the one that is flushed to the right
+        #   boundary.
+        # - that is given by seq_idx_chunk_end below: the sequence index at the end of the chunk.
+        seq_idx_chunk_end = tl.load(seq_idx_ptr +
+                                    (min((c + 1) * chunk_size, seqlen) - 1) *
+                                    stride_seq_idx_seqlen)
+        if HAS_INITSTATES:
+            if prev_seq_idx_chunk_end != seq_idx_chunk_end:
+                # this means in the current chunk the rightmost flushed seq
+                # has changed.
+                # - so we do not propagate the state from previous chunk
+                # - but rather we load that sequence's init state
+                initstates_ptrs = initstates_ptr + seq_idx_chunk_end * stride_initstates_batch
+                # - update state with seq_idx_new's init state
+                states = tl.load(initstates_ptrs, mask=offs_m < dim,
+                                 other=0.0).to(tl.float32)
+                # - we need to consider the cumsum only of the last sequence in the chunk
+                # - find its starting position (given by c_off of the logical chunk index)
+                # - and subtract the cumsum just before that position from the total cumsum
+                # - first, update the logical chunk index (add the number of sequences in the current physical chunk):
+                # sequence index at the start of the current chunk
+                seq_idx_chunk_start = tl.load(seq_idx_ptr +
+                                              min(c * chunk_size, seqlen) *
+                                              stride_seq_idx_seqlen)
+                logical_chunk_idx += seq_idx_chunk_end - seq_idx_chunk_start
+                # - load the chunk offset:
+                #   (the mask keeps the read within the chunk_meta_num entries)
+                c_off = tl.load(chunk_offsets_ptr + logical_chunk_idx,
+                                mask=logical_chunk_idx < chunk_meta_num,
+                                other=0)
+                # - if offset is 0, then the sequence starts at the beginning of the chunk, and we don't need to subtract anything
+                if c_off > 0:
+                    # - dA_cs_ptr currently points to the cumsum at the end of the chunk - subtract the chunk size and add the offset
+                    dA_cs_boundary = tl.load(
+                        dA_cs_ptr - (chunk_size - 1) * stride_dA_cs_csize +
+                        (c_off - 1) * stride_dA_cs_csize,
+                        mask=(c_off - 1) > -1 and c_off < chunk_size,
+                        other=0.0)
+                    dA_cs -= dA_cs_boundary
+            # - increment logical chunk index for every physical chunk
+            logical_chunk_idx += 1
+        else:
+            # Without init states, a change of sequence at the chunk end
+            # zeroes the scale below, so the carried state is dropped.
+            scale_mask = seq_idx_chunk_end == prev_seq_idx_chunk_end
+        prev_seq_idx_chunk_end = seq_idx_chunk_end
+        scale = tl.where(scale_mask, tl.exp(dA_cs), 0.0)
+        states = scale * states + new_states
+        tl.store(out_ptrs, states, mask=offs_m < dim)
+        # Advance all per-chunk pointers to the next chunk.
+        states_ptrs += stride_states_chunk
+        dA_cs_ptr += stride_dA_cs_chunk
+        out_ptrs += stride_out_chunk
+def _state_passing_fwd(
+    states,
+    dA_cumsum,
+    seq_idx,
+    chunk_offsets,
+    initial_states=None,
+    out_dtype=None,
+):
+    nchunks, nheads, dim = states.shape
+    chunk_size = dA_cumsum.shape[-1]
+    assert dA_cumsum.shape == (nheads, nchunks, chunk_size)
+    seqlen = seq_idx.shape[-1]
+    out_dtype = states.dtype if out_dtype is None else out_dtype
+    out = torch.empty((nchunks, nheads, dim),
+                      device=states.device,
+                      dtype=out_dtype)
+    initial_states_strides = ((initial_states.stride(0),
+                               initial_states.stride(1),
+                               initial_states.stride(2))
+                              if initial_states is not None else (0, 0, 0))
+    grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE']), nheads)
+    with torch.cuda.device(states.device.index):
+        _state_passing_fwd_kernel[grid](
+            states_ptr=states,
+            out_ptr=out,
+            dA_cs_ptr=dA_cumsum,
+            initstates_ptr=initial_states,
+            seq_idx_ptr=seq_idx,
+            chunk_offsets_ptr=chunk_offsets,
+            chunk_meta_num=len(chunk_offsets)
+            if chunk_offsets is not None else 0,
+            dim=dim,
+            nchunks=nchunks,
+            seqlen=seqlen if seq_idx is not None else 0,
+            chunk_size=chunk_size if seq_idx is not None else 0,
+            stride_states_chunk=states.stride(0),
+            stride_states_head=states.stride(1),
+            stride_states_dim=states.stride(2),
+            stride_out_chunk=out.stride(0),
+            stride_out_head=out.stride(1),
+            stride_out_dim=out.stride(2),
+            stride_dA_cs_head=dA_cumsum.stride(0),
+            stride_dA_cs_chunk=dA_cumsum.stride(1),
+            stride_dA_cs_csize=dA_cumsum.stride(2),
+            stride_initstates_batch=initial_states_strides[0],
+            stride_initstates_head=initial_states_strides[1],
+            stride_initstates_dim=initial_states_strides[2],
+            stride_seq_idx_seqlen=seq_idx.stride(0),
+            HAS_INITSTATES=initial_states is not None,
+        )
+    return out

vllm/model_executor/layers/mamba/short_conv.py ADDED Viewed

@@ -0,0 +1,253 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Optional
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
+import torch
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.mamba.abstract import MambaBase
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
+    causal_conv1d_fn, causal_conv1d_update)
+from vllm.utils import direct_register_custom_op
+from vllm.v1.attention.backends.short_conv_attn import (
+    ShortConvAttentionMetadata)
+@CustomOp.register("short_conv")
+class ShortConv(MambaBase, CustomOp):
+    """Gated short causal convolution layer registered as ``short_conv``.
+
+    The input is projected into three equally sized parts (B, C, x); the
+    product ``B * x`` goes through a causal 1-D convolution, is gated by
+    ``C`` and finally passed through an output projection.
+    """
+
+    def __init__(self,
+                 config,
+                 dim: int,
+                 layer_idx: int,
+                 model_config: Optional[ModelConfig] = None,
+                 cache_config: Optional[CacheConfig] = None,
+                 prefix: str = ""):
+        """Build the projections and register the layer under ``prefix``.
+
+        Reads ``config.conv_L_cache`` and ``config.conv_bias``.
+
+        Raises:
+            ValueError: if a layer with the same ``prefix`` is already
+                present in the static forward context.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.conv_dim = dim
+        self.L_cache = config.conv_L_cache
+        self.bias = config.conv_bias
+        self.conv = ColumnParallelLinear(
+            input_size=self.L_cache,
+            output_size=dim,
+            bias=self.bias,
+            prefix=f"{prefix}.conv1d",
+        )
+        # unsqueeze to fit conv1d weights shape into the linear weights shape.
+        # Can't do this in `weight_loader` since it already exists in
+        # `ColumnParallelLinear` and `set_weight_attrs`
+        # doesn't allow to override it
+        self.conv.weight.data = self.conv.weight.data.unsqueeze(1)
+        self.in_proj = MergedColumnParallelLinear(
+            input_size=dim,
+            output_sizes=[dim] * 3,
+            bias=self.bias,
+            prefix=f"{prefix}.in_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            input_size=dim,
+            output_size=dim,
+            bias=self.bias,
+            prefix=f"{prefix}.out_proj",
+        )
+        # Register this layer by name; a second layer with the same prefix
+        # is rejected.
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
+        # Placeholder cache; presumably replaced with the real state cache
+        # elsewhere before forward_cuda runs - TODO confirm.
+        self.kv_cache = (torch.tensor([]), )
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.prefix = prefix
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+    ):
+        """No-op: does nothing and returns None."""
+        return
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+    ):
+        """Dispatch to the ``torch.ops.vllm.short_conv`` op for this layer.
+
+        The layer is identified by ``self.prefix``; nothing is returned.
+        """
+        torch.ops.vllm.short_conv(
+            hidden_states,
+            output,
+            self.prefix,
+        )
+    def forward_cuda(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+    ):
+        """Run the layer and write the result into ``output``.
+
+        With attention metadata present, the first ``num_actual_tokens`` rows
+        of ``output`` are overwritten and nothing is returned. Without
+        metadata (profile run) the projected tensor is returned instead and
+        ``output`` is left untouched.
+        """
+        forward_context = get_forward_context()
+        # ShortConvAttentionMetadata contains metadata necessary for the
+        # short_conv triton kernels to operate in continuous batching and in
+        # chunked prefill modes; they are computed at top-level model forward
+        # since they stay the same and reused for all mamba layers in the same
+        # iteration.
+        attn_metadata: AttentionMetadata = forward_context.attn_metadata
+        if attn_metadata is not None:
+            # Metadata is a dict keyed by layer name; pick this layer's entry.
+            assert isinstance(attn_metadata, dict)
+            attn_metadata = attn_metadata[self.prefix]
+            assert isinstance(attn_metadata, ShortConvAttentionMetadata)
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+            conv_state = self_kv_cache[0].transpose(-1, -2)
+            state_indices_tensor = attn_metadata.state_indices_tensor
+            has_initial_states_p = attn_metadata.has_initial_states
+        BCx, _ = self.in_proj(hidden_states)
+        B, C, x = BCx.chunk(3, dim=-1)
+        # Drop the axis added by unsqueeze(1) in __init__.
+        conv_weights = self.conv.weight.view(self.conv.weight.size(0),
+                                             self.conv.weight.size(2))
+        if attn_metadata is None:
+            # V1 profile run
+            # NOTE(review): this branch returns the result instead of writing
+            # it into `output`, unlike the path below - confirm intended.
+            Bx = (B * x).contiguous()
+            hidden_states = C * Bx
+            contextualized_states, _ = self.out_proj(hidden_states)
+            return contextualized_states
+        num_prefills = attn_metadata.num_prefills  # request count
+        num_decodes = attn_metadata.num_decode_tokens  # token count (=request)
+        num_prefill_tokens = attn_metadata.num_prefill_tokens  # token count
+        has_prefill = num_prefills > 0
+        has_decode = num_decodes > 0
+        num_actual_tokens = num_decodes + num_prefill_tokens
+        # NOTE: V1 puts decode before prefill
+        # Separate prefill and decode by splitting varlen input
+        # Split along token dimension
+        B_d, B_p = torch.split(
+            B[:num_actual_tokens],
+            [num_decodes, num_prefill_tokens],
+            dim=0,
+        )
+        C_d, C_p = torch.split(
+            C[:num_actual_tokens],
+            [num_decodes, num_prefill_tokens],
+            dim=0,
+        )
+        x_d, x_p = torch.split(
+            x[:num_actual_tokens],
+            [num_decodes, num_prefill_tokens],
+            dim=0,
+        )
+        # Split along batch dimension
+        state_indices_tensor_d, state_indices_tensor_p = torch.split(
+            state_indices_tensor,
+            [num_decodes, num_prefills],
+            dim=0,
+        )
+        # Start offsets of the prefill requests, shifted so that the first
+        # prefill token sits at position 0.
+        query_start_loc_p = (
+            attn_metadata.query_start_loc[-num_prefills - 1:] -
+            num_decodes if has_prefill else None)
+        conv_output_list = []
+        if has_prefill:
+            Bx_p = (B_p * x_p).transpose(0, 1)
+            Bx = causal_conv1d_fn(Bx_p,
+                                  conv_weights,
+                                  self.conv.bias,
+                                  activation=None,
+                                  conv_states=conv_state,
+                                  has_initial_state=has_initial_states_p,
+                                  cache_indices=state_indices_tensor_p,
+                                  metadata=attn_metadata,
+                                  query_start_loc=query_start_loc_p).transpose(
+                                      0, 1)[:num_prefill_tokens]
+            y = C_p * Bx
+            conv_output_list.append(y)
+        if has_decode:
+            Bx_d = (B_d * x_d).contiguous()
+            Bx = causal_conv1d_update(
+                Bx_d,
+                conv_state,
+                conv_weights,
+                self.conv.bias,
+                activation=None,
+                conv_state_indices=state_indices_tensor_d)
+            y = C_d * Bx
+            # Decode results go first so the merged tensor keeps the
+            # decode-before-prefill token order.
+            conv_output_list.insert(0, y)
+        # Merge prefill and decode outputs before passing to gated MLP
+        hidden_states = torch.vstack(conv_output_list)
+        # Final linear projection
+        output[:num_actual_tokens], _ = self.out_proj(hidden_states)
+    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
+        """Return the state dtypes from the model dtype and cache config.
+
+        Both ``model_config`` and ``cache_config`` must have been provided.
+        """
+        assert self.model_config is not None
+        assert self.cache_config is not None
+        return MambaStateDtypeCalculator.short_conv_state_dtype(
+            self.model_config.dtype,
+            self.cache_config.mamba_cache_dtype,
+        )
+    def get_state_shape(self) -> tuple[tuple[int, ...]]:
+        """Return the state shape for this layer.
+
+        Computed from the TP world size, ``conv_dim`` and ``L_cache``.
+        """
+        return MambaStateShapeCalculator.short_conv_state_shape(
+            tp_world_size=get_tensor_model_parallel_world_size(),
+            intermediate_size=self.conv_dim,
+            conv_kernel=self.L_cache,
+        )
+    @property
+    def mamba_type(self) -> str:
+        """Identifier string of this layer type."""
+        return "short_conv"
+    def get_attn_backend(self) -> type["AttentionBackend"]:
+        """Return the ``ShortConvAttentionBackend`` class (imported lazily)."""
+        from vllm.v1.attention.backends.short_conv_attn import (
+            ShortConvAttentionBackend)
+        return ShortConvAttentionBackend
+def short_conv(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    self.forward_cuda(hidden_states=hidden_states, output=output)
+def short_conv_fake(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    return
+# Register `short_conv` as a custom op that mutates `output`, with
+# `short_conv_fake` as its fake implementation.
+direct_register_custom_op(
+    op_name="short_conv",
+    op_func=short_conv,
+    mutates_args=["output"],
+    fake_impl=short_conv_fake,
+)