PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/v1/attention/backends/mla/cutlass_mla.py ADDED Viewed

@@ -0,0 +1,248 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from typing import ClassVar, Optional, Union
+import torch
+import vllm._custom_ops as ops
+from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
+                                                   MLACommonImpl,
+                                                   MLACommonMetadata,
+                                                   MLACommonMetadataBuilder)
+from vllm.v1.attention.backends.utils import AttentionCGSupport
+# Module-level logger, initialised with this module's __name__.
+logger = init_logger(__name__)
+class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
+    """Metadata builder for the CUTLASS MLA backend.
+
+    Adds nothing beyond the base builder except the CUDA Graph support level
+    declared below.
+    """
+    # Enables full CUDA Graph support for decode-only capture.
+    cudagraph_support: ClassVar[AttentionCGSupport] = (
+        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE)
+class CutlassMLABackend(MLACommonBackend):
+    """Backend descriptor tying together the CUTLASS MLA classes."""
+    @staticmethod
+    def get_name() -> str:
+        """Return the identifier string of this backend."""
+        return "CUTLASS_MLA"
+    @staticmethod
+    def get_impl_cls() -> type["CutlassMLAImpl"]:
+        """Return the attention implementation class of this backend."""
+        return CutlassMLAImpl
+    @staticmethod
+    def get_builder_cls() -> type["CutlassMLAMetadataBuilder"]:
+        """Return the metadata builder class of this backend."""
+        return CutlassMLAMetadataBuilder
+class SM100Workspace:
+    """Growable uint8 scratch buffer for the SM100 CUTLASS MLA decode op.
+
+    The buffer is allocated on the "cuda" device at construction time and is
+    enlarged in place by ``ensure_size`` whenever the op reports that a larger
+    workspace is required.
+    """
+    def __init__(self, initial_workspace_size):
+        """Allocate the buffer and cache the multiprocessor count.
+
+        Args:
+            initial_workspace_size: Number of bytes to allocate up front.
+        """
+        self._workspace_buf = torch.empty(initial_workspace_size,
+                                          dtype=torch.uint8,
+                                          device="cuda")
+        # Forced to 128
+        self._block_size = 128
+        # The SM count is looked up once and reused. Device 0 stands in for
+        # every device (assumes all devices are similar).
+        device_zero = torch.device("cuda:0")
+        self._sm_count = torch.cuda.get_device_properties(
+            device_zero).multi_processor_count
+    def get_buf(self):
+        """Return the underlying workspace tensor."""
+        return self._workspace_buf
+    def ensure_size(self, attn_metadata: MLACommonMetadata,
+                    num_kv_splits: int):
+        """Grow the buffer if the op needs more space for this batch.
+
+        Args:
+            attn_metadata: Metadata providing ``num_reqs`` and
+                ``max_query_len`` for the current batch.
+            num_kv_splits: Split count forwarded to the size query.
+        """
+        num_batches = attn_metadata.num_reqs
+        # NOTE(review): the value passed as the sequence-length argument is
+        # derived from max_query_len, not a KV sequence length - confirm this
+        # is intended.
+        seq_len_arg = attn_metadata.max_query_len * self._block_size
+        required_bytes = ops.sm100_cutlass_mla_get_workspace_size(
+            seq_len_arg,
+            num_batches,
+            self._sm_count,
+            num_kv_splits=num_kv_splits)
+        # The buffer only ever grows; it is never shrunk.
+        if required_bytes > self._workspace_buf.shape[0]:
+            self._workspace_buf.resize_(required_bytes)
+# Shared workspace instance, constructed when this module is imported (the
+# constructor allocates the buffer on the "cuda" device).
+g_sm100_workspace = SM100Workspace(128 * 1024 * 1024)  # 128MB
+# Head-count limit; CutlassMLAImpl.__init__ passes it as q_pad_num_heads.
+MAX_HEADS = 128
+class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
+    """MLA decode implementation backed by ``ops.sm100_cutlass_mla_decode``.
+
+    The constructor forwards its arguments to the base class together with
+    ``q_pad_num_heads=MAX_HEADS`` and rejects unsupported features. Decoding
+    uses the module-level shared workspace ``g_sm100_workspace``.
+    """
+    can_return_lse_for_decode: bool = True
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[list[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
+            # MLA Specific Arguments
+            **mla_args) -> None:
+        """Initialise the implementation and validate the configuration.
+
+        Raises:
+            NotImplementedError: If ``alibi_slopes``, ``sliding_window`` or
+                ``logits_soft_cap`` is truthy, or if ``attn_type`` is not
+                ``AttentionType.DECODER``.
+            ValueError: If the ``FORCE_NUM_KV_SPLITS`` environment variable is
+                set to a non-empty value that ``int()`` cannot parse.
+        """
+        super().__init__(num_heads,
+                         head_size,
+                         scale,
+                         num_kv_heads,
+                         alibi_slopes,
+                         sliding_window,
+                         kv_cache_dtype,
+                         logits_soft_cap,
+                         attn_type,
+                         kv_sharing_target_layer_name,
+                         q_pad_num_heads=MAX_HEADS,
+                         **mla_args)
+        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
+        if any(unsupported_features):
+            raise NotImplementedError(
+                "CutlassMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, logits_soft_cap")
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "CutlassMLAImpl")
+        # TODO: Currently, num_kv_splits is limited to 16 to avoid hanging
+        #       issues. In case the code hangs, use:
+        #       FORCE_NUM_KV_SPLITS=1
+        force_num_kv_splits = os.environ.get("FORCE_NUM_KV_SPLITS", None)
+        if force_num_kv_splits:
+            # Parse the override once and reuse it for logging and storage.
+            forced_splits = int(force_num_kv_splits)
+            logger.warning_once("Forcing num_kv_splits to %d", forced_splits)
+            self._num_kv_splits = forced_splits
+        else:
+            self._num_kv_splits = -1  # => Auto-detect
+        # Share workspace buffer across all executions
+        self._workspace = g_sm100_workspace
+    def _sm100_cutlass_mla_decode(
+        self,
+        q_nope: torch.Tensor,
+        q_pe: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        seq_lens: torch.Tensor,
+        page_table: torch.Tensor,
+        workspace: torch.Tensor,
+        sm_scale: float,
+        num_kv_splits: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Validate the inputs and run ``ops.sm100_cutlass_mla_decode``.
+
+        Args:
+            q_nope: 3D query tensor whose last dimension must be 512.
+            q_pe: 3D query tensor whose last dimension must be 64; its first
+                two dimensions must match ``q_nope``.
+            kv_c_and_k_pe_cache: 3D cache tensor whose last dimension must be
+                512 + 64 and whose dtype must match the queries.
+            seq_lens: int32 tensor forwarded to the op.
+            page_table: 2D int32 tensor whose first dimension must match the
+                first dimension of ``q_nope``.
+            workspace: Scratch buffer forwarded to the op.
+            sm_scale: Scale forwarded to the op.
+            num_kv_splits: Split count forwarded to the op.
+
+        Returns:
+            ``(out, lse)``. ``out`` is allocated with ``MAX_HEADS`` heads and
+            sliced to the real head count. ``lse`` is an empty
+            ``torch.Tensor()`` unless ``need_to_return_lse_for_decode`` is set.
+
+        Raises:
+            AssertionError: If any of the shape or dtype checks fails.
+        """
+        assert (q_nope.ndim == 3
+                ), f"q_nope must be a 3D tensor, but got {q_nope.ndim}"
+        assert (
+            q_pe.ndim == 3), f"q_pe must be a 3D tensor, but got {q_pe.ndim}"
+        assert (
+            kv_c_and_k_pe_cache.ndim == 3
+        ), ("kv_c_and_k_pe_cache must be a 3D tensor, but got "
+            f"{kv_c_and_k_pe_cache.ndim}")
+        B_q, H, D_q_nope = q_nope.shape
+        B_q_2, H_2, D_q_pe = q_pe.shape
+        assert (B_q == B_q_2) and (H == H_2)
+        _, PAGE_SIZE, D_ckv = kv_c_and_k_pe_cache.shape
+        D_latent = 512
+        D_rope = 64
+        assert D_q_nope == D_latent
+        assert D_q_pe == D_rope
+        assert D_ckv == D_latent + D_rope
+        # MAX_HEADS is the module-level constant, i.e. the same value that
+        # __init__ passes to the base class as q_pad_num_heads.
+        assert H <= MAX_HEADS, f"H must be <= {MAX_HEADS}, but got {H}"
+        assert len(page_table.shape) == 2
+        B_block_table, block_num = page_table.shape
+        assert B_block_table == B_q
+        assert (block_num
+                > 0), f"block num must be greater than 0, got {block_num}"
+        # NOTE: `/` is true division, so the modulus is taken against a float.
+        assert block_num % (128 / PAGE_SIZE) == 0
+        assert q_nope.dtype in (
+            torch.float16, torch.bfloat16, torch.float8_e4m3fn), (
+                f"q_nope.dtype needs to be fp16 or bf16 or e4m3 but got "
+                f"{q_nope.dtype}.")
+        assert q_nope.dtype == q_pe.dtype == kv_c_and_k_pe_cache.dtype
+        assert (
+            seq_lens.dtype == torch.int32
+        ), f"seq_lens.dtype needs to be int32 but got {seq_lens.dtype}."
+        assert (
+            page_table.dtype == torch.int32
+        ), f"page_table.dtype needs to be int32 but got {page_table.dtype}."
+        # With a quantized KV cache the output is produced in bfloat16;
+        # otherwise it uses the query dtype.
+        dtype = (torch.bfloat16 if is_quantized_kv_cache(self.kv_cache_dtype)
+                 else q_nope.dtype)
+        out = q_nope.new_empty((B_q, MAX_HEADS, D_latent), dtype=dtype)
+        lse = (torch.empty(
+            (B_q, MAX_HEADS), dtype=torch.float32, device=q_nope.device)
+               if self.need_to_return_lse_for_decode else torch.Tensor())
+        ops.sm100_cutlass_mla_decode(
+            out,
+            lse,
+            q_nope,
+            q_pe,
+            kv_c_and_k_pe_cache,
+            seq_lens,
+            page_table,
+            workspace,
+            sm_scale,
+            num_kv_splits,
+        )
+        if H < MAX_HEADS:
+            # Extract the subsets of the outputs
+            lse = lse[:, :H] if self.need_to_return_lse_for_decode else lse
+            out = out[:, :H]
+        return out, lse
+    def _forward_decode(
+        self,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: MLACommonMetadata,
+        layer: AttentionLayer,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Run the decode path for one batch.
+
+        Args:
+            q: Either a ``(q_nope, q_pe)`` tuple or a single tensor that is
+                split along the last dimension into
+                ``[kv_lora_rank, qk_rope_head_dim]``.
+            kv_c_and_k_pe_cache: Non-empty cache tensor.
+            attn_metadata: Metadata whose ``decode`` field must be set.
+            layer: Unused by this implementation.
+
+        Returns:
+            ``(output, lse)`` where ``lse`` is ``None`` unless
+            ``need_to_return_lse_for_decode`` is set.
+        """
+        assert kv_c_and_k_pe_cache.numel() > 0
+        assert attn_metadata.decode is not None
+        # isinstance (rather than an exact type check) also accepts tuple
+        # subclasses and matches the check used by FlashInferMLAImpl.
+        if isinstance(q, tuple):
+            q_nope, q_pe = q
+        else:
+            q_nope, q_pe = torch.split(
+                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        # Adjust workspace size (if necessary)
+        self._workspace.ensure_size(attn_metadata, self._num_kv_splits)
+        # Run MLA
+        o, lse = self._sm100_cutlass_mla_decode(
+            q_nope,
+            q_pe,
+            kv_c_and_k_pe_cache,
+            attn_metadata.decode.seq_lens,
+            attn_metadata.decode.block_table,
+            self._workspace.get_buf(),
+            self.scale,
+            self._num_kv_splits,
+        )
+        return o, (lse if self.need_to_return_lse_for_decode else None)

vllm/v1/attention/backends/mla/flashattn_mla.py ADDED Viewed

@@ -0,0 +1,271 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+from typing import ClassVar, Optional, Union
+import torch
+from vllm import envs
+from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.attention.utils.fa_utils import (flash_attn_supports_mla,
+                                           get_flash_attn_version)
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_dcp_group
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
+                                                   MLACommonDecodeMetadata,
+                                                   MLACommonImpl,
+                                                   MLACommonMetadata,
+                                                   MLACommonMetadataBuilder)
+from vllm.v1.attention.backends.utils import AttentionCGSupport
+from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
+# Module-level logger, initialised with this module's __name__.
+logger = init_logger(__name__)
+class FlashAttnMLABackend(MLACommonBackend):
+    """Backend descriptor tying together the FlashAttention MLA classes."""
+    @staticmethod
+    def get_name() -> str:
+        """Return the identifier string of this backend."""
+        return "FLASH_ATTN_MLA"
+    @staticmethod
+    def get_metadata_cls() -> type["FlashAttnMLAMetadata"]:
+        """Return the attention metadata class of this backend."""
+        return FlashAttnMLAMetadata
+    @staticmethod
+    def get_builder_cls() -> type["FlashAttnMLAMetadataBuilder"]:
+        """Return the metadata builder class of this backend."""
+        return FlashAttnMLAMetadataBuilder
+    @staticmethod
+    def get_impl_cls() -> type["FlashAttnMLAImpl"]:
+        """Return the attention implementation class of this backend."""
+        return FlashAttnMLAImpl
+@dataclass
+class FlashAttnMLADecodeMetadata(MLACommonDecodeMetadata):
+    """Decode metadata extended with the fields the FlashAttention call reads.
+
+    ``FlashAttnMLAImpl._forward_decode`` forwards these values to
+    ``flash_attn_varlen_func``.
+    """
+    # Forwarded as cu_seqlens_q.
+    query_start_loc: torch.Tensor
+    # Forwarded as max_seqlen_q (clamped to at least 1 by the caller).
+    max_query_len: int
+    # Forwarded as max_seqlen_k.
+    max_seq_len: int
+    # Forwarded as scheduler_metadata; None when no schedule was computed.
+    scheduler_metadata: Optional[torch.Tensor] = None
+    # Forwarded as num_splits.
+    max_num_splits: int = 0
+@dataclass
+class FlashAttnMLAMetadata(MLACommonMetadata[FlashAttnMLADecodeMetadata]):
+    """Common MLA metadata specialised to ``FlashAttnMLADecodeMetadata``."""
+    pass
+class FlashAttnMLAMetadataBuilder(
+        MLACommonMetadataBuilder[FlashAttnMLAMetadata]):
+    """Builds ``FlashAttnMLADecodeMetadata`` for the FlashAttention backend.
+
+    When FlashAttention version 3 is combined with full CUDA graphs, the
+    scheduler metadata is copied into a persistent buffer owned by the
+    builder.
+    """
+    cudagraph_support: ClassVar[AttentionCGSupport] = (
+        AttentionCGSupport.UNIFORM_BATCH)
+    reorder_batch_threshold: int = 512
+    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
+                 vllm_config: VllmConfig, device: torch.device):
+        """Configure split limits and, if needed, the persistent buffer.
+
+        Raises:
+            ValueError: If FA3 and full CUDA graphs are both in use and the
+                configured maximum capture size exceeds 992.
+        """
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device,
+                         FlashAttnMLAMetadata)
+        # No upper bound on the number of splits.
+        self.max_num_splits = 0
+        self.fa_aot_schedule = (get_flash_attn_version() == 3)
+        cudagraph_mode = self.compilation_config.cudagraph_mode
+        self.use_full_cuda_graph = cudagraph_mode.has_full_cudagraphs()
+        if self.use_full_cuda_graph and self.fa_aot_schedule:
+            self.max_cudagraph_size = self.compilation_config.max_capture_size
+            # This condition derives from FA3's internal heuristic.
+            # TODO(woosuk): Support larger cudagraph sizes.
+            if self.max_cudagraph_size > 992:
+                raise ValueError(
+                    "Capture size larger than 992 is not supported for "
+                    "full cuda graph.")
+            buffer_len = vllm_config.scheduler_config.max_num_seqs + 1
+            self.scheduler_metadata = torch.zeros(
+                buffer_len,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            # Under CUDA graphs the number of splits needs an upper bound so
+            # that sufficiently large intermediate buffers are pre-allocated
+            # during capture.
+            self.max_num_splits = (
+                envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH)
+        # TODO(lucas): Until we add support for the DCP custom masking we need
+        #   to restrict decodes to q_len == 1 when DCP is enabled.
+        dcp_enabled = get_dcp_group().world_size > 1
+        self.reorder_batch_threshold = (
+            1 if dcp_enabled else self.reorder_batch_threshold)
+    def _schedule_decode(self, num_reqs, cu_query_lens, max_query_len, seqlens,
+                         max_seq_len, causal):
+        """Return scheduler metadata, or ``None`` when not using FA3."""
+        if not self.fa_aot_schedule:
+            return None
+        return get_scheduler_metadata(
+            batch_size=num_reqs,
+            max_seqlen_q=max_query_len,
+            max_seqlen_k=max_seq_len,
+            num_heads_q=self.num_heads,
+            num_heads_kv=1,
+            headdim=self.mla_dims.qk_rope_head_dim,
+            cache_seqlens=seqlens,
+            qkv_dtype=self.kv_cache_spec.dtype,
+            headdim_v=self.mla_dims.kv_lora_rank,
+            page_size=self.page_size,
+            cu_seqlens_q=cu_query_lens,
+            causal=causal,
+            num_splits=self.max_num_splits,
+        )
+    def _build_decode(self, block_table_tensor: torch.Tensor,
+                      seq_lens_cpu: torch.Tensor,
+                      seq_lens_device: torch.Tensor,
+                      query_start_loc_cpu: torch.Tensor,
+                      query_start_loc_device: torch.Tensor,
+                      num_decode_tokens: int) -> FlashAttnMLADecodeMetadata:
+        """Assemble the decode metadata for the current batch.
+
+        The maximum query and sequence lengths are computed from the CPU
+        tensors. The device tensors are stored in the returned metadata.
+        """
+        max_query_len = (query_start_loc_cpu[1:] -
+                         query_start_loc_cpu[:-1]).max().item()
+        max_seq_len = seq_lens_cpu.max().item()
+        scheduler_metadata = self._schedule_decode(
+            num_reqs=seq_lens_cpu.numel(),
+            cu_query_lens=query_start_loc_device,
+            max_query_len=max_query_len,
+            seqlens=seq_lens_device,
+            max_seq_len=max_seq_len,
+            causal=True,
+        )
+        # For FA3 + full cudagraph
+        max_num_splits = 0
+        if self.use_full_cuda_graph and scheduler_metadata is not None:
+            n = scheduler_metadata.shape[0]
+            capacity = self.scheduler_metadata.shape[0]
+            # Ensure the persistent buffer is large enough
+            assert n <= capacity, (
+                f"Scheduler metadata size {n} exceeds buffer size "
+                f"{capacity}")
+            self.scheduler_metadata[:n] = scheduler_metadata
+            # NOTE(woosuk): We should zero out the rest of the scheduler
+            # metadata to guarantee the correctness. Otherwise, some thread
+            # blocks may use the invalid scheduler metadata and overwrite the
+            # output buffer.
+            self.scheduler_metadata[n:] = 0
+            scheduler_metadata = self.scheduler_metadata[:n]
+            # NOTE(woosuk): Setting num_splits > 1 may increase the memory
+            # usage, because the intermediate buffers of size [num_splits,
+            # num_heads, num_tokens, head_size] are allocated. Therefore,
+            # we only set num_splits when using cuda graphs.
+            if num_decode_tokens <= self.max_cudagraph_size:
+                max_num_splits = self.max_num_splits
+        return FlashAttnMLADecodeMetadata(
+            block_table=block_table_tensor,
+            seq_lens=seq_lens_device,
+            query_start_loc=query_start_loc_device,
+            max_query_len=max_query_len,
+            max_seq_len=max_seq_len,
+            scheduler_metadata=scheduler_metadata,
+            max_num_splits=max_num_splits,
+        )
+class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
+    """MLA decode implementation backed by ``flash_attn_varlen_func``.
+
+    The kernel is always called with ``fa_version=3``.
+    """
+    can_return_lse_for_decode: bool = True
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[list[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
+            # MLA Specific Arguments
+            **mla_args) -> None:
+        """Initialise the implementation and validate the configuration.
+
+        Raises:
+            AssertionError: If ``flash_attn_supports_mla()`` is false.
+            NotImplementedError: If ``alibi_slopes``, ``sliding_window`` or
+                ``logits_soft_cap`` is truthy, if ``attn_type`` is not
+                ``AttentionType.DECODER``, or if the KV cache dtype is
+                quantized.
+        """
+        super().__init__(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         logits_soft_cap, attn_type,
+                         kv_sharing_target_layer_name, **mla_args)
+        assert flash_attn_supports_mla(), \
+            "FlashAttnMLA is not supported on this device"
+        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
+        if any(unsupported_features):
+            raise NotImplementedError(
+                "FlashAttnMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, logits_soft_cap")
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashAttnMLAImpl")
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "FlashAttnMLA V1 with FP8 KV cache not yet supported")
+    def _forward_decode(
+        self,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: FlashAttnMLAMetadata,
+        layer: AttentionLayer,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Run the decode path for one batch.
+
+        Args:
+            q: Either a ``(q_nope, q_pe)`` tuple or a single tensor that is
+                split along the last dimension into
+                ``[kv_lora_rank, qk_rope_head_dim]``.
+            kv_c_and_k_pe_cache: Non-empty cache tensor. Its last dimension is
+                sliced at ``kv_lora_rank`` into the value and key parts.
+            attn_metadata: Metadata whose ``decode`` field must be set.
+            layer: Unused by this implementation.
+
+        Returns:
+            ``(output, lse)``. ``lse`` is transposed before being returned,
+            and is ``None`` unless ``need_to_return_lse_for_decode`` is set.
+
+        Raises:
+            NotImplementedError: If ``kv_cache_dtype`` starts with ``"fp8"``.
+        """
+        assert kv_c_and_k_pe_cache.numel() > 0
+        assert attn_metadata.decode is not None
+        # isinstance (rather than an exact type check) also accepts tuple
+        # subclasses and matches the check used by FlashInferMLAImpl.
+        if isinstance(q, tuple):
+            q_nope, q_pe = q
+        else:
+            q_nope, q_pe = torch.split(
+                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        # __init__ already rejects quantized KV caches; this is a second guard
+        # on the decode path.
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError(
+                "FP8 FlashAttention MLA not yet supported")
+        kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank]
+        k_pe_cache = kv_c_and_k_pe_cache[..., self.kv_lora_rank:]
+        # NOTE(matt): During CUDA graph capture, max_query_len can be 0, but the
+        # kernel uses this to calculate grid dimensions. Ensure it's at least 1
+        # to prevent invalid grid configuration during graph capture.
+        max_seqlen_q = max(attn_metadata.decode.max_query_len, 1)
+        attn_out = flash_attn_varlen_func(
+            q=q_pe,
+            k=k_pe_cache.unsqueeze(-2),  # Add head dim of 1
+            v=kv_c_cache.unsqueeze(-2),  # Add head dim of 1
+            q_v=q_nope,
+            max_seqlen_q=max_seqlen_q,
+            cu_seqlens_q=attn_metadata.decode.query_start_loc,
+            max_seqlen_k=attn_metadata.decode.max_seq_len,
+            seqused_k=attn_metadata.decode.seq_lens,
+            block_table=attn_metadata.decode.block_table,
+            softmax_scale=self.scale,
+            causal=True,
+            return_softmax_lse=self.need_to_return_lse_for_decode,
+            fa_version=3,  # only version 3 is supported
+            scheduler_metadata=attn_metadata.decode.scheduler_metadata,
+            num_splits=attn_metadata.decode.max_num_splits,
+        )
+        if self.need_to_return_lse_for_decode:
+            o, lse = attn_out
+            # FA returns LSE in shape [ H, B ] but DCP wants [ B, H ]
+            return o, lse.transpose(0, 1)  # [ H, B ] -> [ B, H ]
+        else:
+            o = attn_out
+            return o, None

vllm/v1/attention/backends/mla/flashinfer_mla.py ADDED Viewed

@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional, Union
+import torch
+from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
+from vllm.attention.backends.abstract import AttentionLayer, AttentionType
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
+                                                   MLACommonImpl,
+                                                   MLACommonMetadata)
+# Module-level logger, initialised with this module's __name__.
+logger = init_logger(__name__)
+# Element count of the uint8 workspace tensor allocated below (128 MiB).
+FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
+class FlashInferMLABackend(MLACommonBackend):
+    """Backend descriptor for the FlashInfer MLA implementation."""
+    @staticmethod
+    def get_name() -> str:
+        """Return the identifier string of this backend."""
+        return "FLASHINFER_MLA"
+    @staticmethod
+    def get_impl_cls() -> type["FlashInferMLAImpl"]:
+        """Return the attention implementation class of this backend."""
+        return FlashInferMLAImpl
+# Zero-filled uint8 buffer on the "cuda" device, allocated when this module
+# is imported. Every FlashInferMLAImpl instance stores a reference to it as
+# its workspace buffer.
+g_fi_workspace = torch.zeros(
+    FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
+    dtype=torch.uint8,
+    device="cuda",
+)
+class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
+    """MLA decode implementation backed by FlashInfer.
+
+    Decoding calls ``trtllm_batch_decode_with_kv_cache_mla`` with the
+    module-level workspace buffer. No LSE is returned.
+    """
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[list[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
+            # MLA Specific Arguments
+            **mla_args) -> None:
+        """Initialise the implementation and validate the configuration.
+
+        Raises:
+            NotImplementedError: If ``alibi_slopes``, ``sliding_window`` or
+                ``logits_soft_cap`` is truthy, or if ``attn_type`` is not
+                ``AttentionType.DECODER``.
+        """
+        super().__init__(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         logits_soft_cap, attn_type,
+                         kv_sharing_target_layer_name, **mla_args)
+        if alibi_slopes or sliding_window or logits_soft_cap:
+            raise NotImplementedError(
+                "FlashInferMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, logits_soft_cap")
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashInferMLAImpl")
+        self._workspace_buffer = g_fi_workspace
+        # Both scales are filled in on the first decode call, from the layer.
+        self.bmm1_scale: Optional[float] = None
+        self.bmm2_scale: Optional[float] = None
+    def _forward_decode(
+        self,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: MLACommonMetadata,
+        layer: AttentionLayer,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Run the decode path for one batch.
+
+        Args:
+            q: Either a single tensor or a ``(q_nope, q_pe)`` tuple, which is
+                concatenated along the last dimension.
+            kv_c_and_k_pe_cache: Non-empty cache tensor.
+            attn_metadata: Metadata whose ``decode`` field must be set.
+            layer: Source of the q/k/v scale floats, read on the first call.
+
+        Returns:
+            ``(output, None)``.
+        """
+        assert kv_c_and_k_pe_cache.numel() > 0
+        decode_meta = attn_metadata.decode
+        assert decode_meta is not None
+        query = torch.cat(list(q), dim=-1) if isinstance(q, tuple) else q
+        # trtllm API requires extra dimension q_len_per_request for MTP
+        query = query.unsqueeze(1)
+        # The scales are computed once and reused by later calls.
+        if self.bmm1_scale is None:
+            self.bmm1_scale = (layer._q_scale_float * layer._k_scale_float *
+                               self.scale)
+        if self.bmm2_scale is None:
+            self.bmm2_scale = layer._v_scale_float
+        output = trtllm_batch_decode_with_kv_cache_mla(
+            query=query,
+            kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
+            workspace_buffer=self._workspace_buffer,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            block_tables=decode_meta.block_table,
+            seq_lens=decode_meta.seq_lens,
+            max_seq_len=attn_metadata.max_seq_len,
+            bmm1_scale=self.bmm1_scale,
+            bmm2_scale=self.bmm2_scale,
+        )
+        # TODO: Return LSE pending support from Flashinfer API:
+        # https://github.com/flashinfer-ai/flashinfer/pull/1566
+        return output, None