PyPI - vllm-cpu-avx512bf16 - Versions diffs - 0.14.0__cp313-cp313-manylinux_2_28_x86_64.whl - Mend

vllm-cpu-avx512bf16 0.14.0__cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1712) hide show

vllm/v1/worker/gpu/block_table.py ADDED Viewed

@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+import torch
+from vllm.triton_utils import tl, triton
+from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backends.utils import PAD_SLOT_ID
+from vllm.v1.worker.gpu.buffer_utils import StagedWriteTensor, UvaBackedTensor
+class BlockTables:
+    def __init__(
+        self,
+        block_sizes: list[int],
+        max_num_reqs: int,
+        max_num_batched_tokens: int,
+        max_model_len: int,
+        device: torch.device,
+    ):
+        self.block_sizes = block_sizes
+        self.max_num_reqs = max_num_reqs
+        self.max_num_batched_tokens = max_num_batched_tokens
+        self.max_model_len = max_model_len
+        self.device = device
+        self.num_kv_cache_groups = len(self.block_sizes)
+        # num_kv_cache_groups x [max_num_reqs, max_num_blocks]
+        self.block_tables: list[StagedWriteTensor] = []
+        for i in range(self.num_kv_cache_groups):
+            block_size = self.block_sizes[i]
+            max_num_blocks = cdiv(self.max_model_len, block_size)
+            block_table = StagedWriteTensor(
+                (self.max_num_reqs, max_num_blocks),
+                dtype=torch.int32,
+                device=device,
+            )
+            self.block_tables.append(block_table)
+        self.block_table_ptrs = self._make_ptr_tensor(
+            [b.gpu for b in self.block_tables]
+        )
+        self.block_table_strides = torch.tensor(
+            [b.gpu.stride(0) for b in self.block_tables],
+            dtype=torch.int64,
+            device=self.device,
+        )
+        self.block_sizes_tensor = torch.tensor(
+            self.block_sizes, dtype=torch.int32, device=self.device
+        )
+        self.num_blocks = UvaBackedTensor(
+            (self.num_kv_cache_groups, self.max_num_reqs),
+            dtype=torch.int32,
+        )
+        # Block tables used for model's forward pass.
+        # num_kv_cache_groups x [max_num_reqs, max_num_blocks]
+        self.input_block_tables: list[torch.Tensor] = [
+            torch.zeros_like(b.gpu) for b in self.block_tables
+        ]
+        self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables)
+        self.slot_mappings = torch.zeros(
+            self.num_kv_cache_groups,
+            self.max_num_batched_tokens,
+            dtype=torch.int64,
+            device=self.device,
+        )
+    def _make_ptr_tensor(self, x: Iterable[torch.Tensor]) -> torch.Tensor:
+        # NOTE(woosuk): Use uint64 instead of int64 to cover all possible addresses.
+        return torch.tensor(
+            [t.data_ptr() for t in x],
+            dtype=torch.uint64,
+            device=self.device,
+        )
+    def append_block_ids(
+        self,
+        req_index: int,
+        new_block_ids: tuple[list[int], ...],
+        overwrite: bool,
+    ) -> None:
+        for i in range(self.num_kv_cache_groups):
+            start = self.num_blocks.np[i, req_index] if not overwrite else 0
+            block_ids = new_block_ids[i]
+            self.block_tables[i].stage_write(req_index, start, block_ids)
+            self.num_blocks.np[i, req_index] = start + len(block_ids)
+    def apply_staged_writes(self) -> None:
+        # TODO(woosuk): This can be inefficient since it launches one kernel per
+        # block table. Implement a kernel to handle all block tables at once.
+        for block_table in self.block_tables:
+            block_table.apply_write()
+        self.num_blocks.copy_to_uva()
+    def gather_block_tables(
+        self,
+        idx_mapping: torch.Tensor,
+    ) -> tuple[torch.Tensor, ...]:
+        num_reqs = idx_mapping.shape[0]
+        _gather_block_tables_kernel[(self.num_kv_cache_groups, num_reqs)](
+            idx_mapping,
+            self.block_table_ptrs,
+            self.input_block_table_ptrs,
+            self.block_table_strides,
+            self.num_blocks.gpu,
+            self.num_blocks.gpu.stride(0),
+            BLOCK_SIZE=1024,  # type: ignore
+        )
+        return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)
+    def get_dummy_block_tables(self, num_reqs: int) -> tuple[torch.Tensor, ...]:
+        return tuple(block_table[:num_reqs] for block_table in self.input_block_tables)
+    def compute_slot_mappings(
+        self,
+        query_start_loc: torch.Tensor,
+        positions: torch.Tensor,
+    ) -> torch.Tensor:
+        num_reqs = query_start_loc.shape[0] - 1
+        num_tokens = positions.shape[0]
+        num_groups = self.num_kv_cache_groups
+        _compute_slot_mappings_kernel[(num_groups, num_reqs + 1)](
+            num_tokens,
+            self.max_num_batched_tokens,
+            query_start_loc,
+            positions,
+            self.input_block_table_ptrs,
+            self.block_table_strides,
+            self.block_sizes_tensor,
+            self.slot_mappings,
+            self.slot_mappings.stride(0),
+            PAD_ID=PAD_SLOT_ID,
+            BLOCK_SIZE=1024,  # type: ignore
+        )
+        return self.slot_mappings[:, :num_tokens]
+    def get_dummy_slot_mappings(self, num_tokens: int) -> torch.Tensor:
+        self.slot_mappings.fill_(PAD_SLOT_ID)
+        return self.slot_mappings[:, :num_tokens]
+@triton.jit
+def _gather_block_tables_kernel(
+    batch_idx_to_req_idx,  # [batch_size]
+    src_block_table_ptrs,  # [num_kv_cache_groups]
+    dst_block_table_ptrs,  # [num_kv_cache_groups]
+    block_table_strides,  # [num_kv_cache_groups]
+    num_blocks_ptr,  # [num_kv_cache_groups, max_num_reqs]
+    num_blocks_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # kv cache group id
+    group_id = tl.program_id(0)
+    batch_idx = tl.program_id(1)
+    req_idx = tl.load(batch_idx_to_req_idx + batch_idx)
+    group_num_blocks_ptr = num_blocks_ptr + group_id * num_blocks_stride
+    num_blocks = tl.load(group_num_blocks_ptr + req_idx)
+    stride = tl.load(block_table_strides + group_id)
+    src_block_table_ptr = _load_ptr(src_block_table_ptrs + group_id, tl.int32)
+    src_row_ptr = src_block_table_ptr + req_idx * stride
+    dst_block_table_ptr = _load_ptr(dst_block_table_ptrs + group_id, tl.int32)
+    dst_row_ptr = dst_block_table_ptr + batch_idx * stride
+    for i in tl.range(0, num_blocks, BLOCK_SIZE):
+        offset = i + tl.arange(0, BLOCK_SIZE)
+        block_ids = tl.load(src_row_ptr + offset, mask=offset < num_blocks)
+        tl.store(dst_row_ptr + offset, block_ids, mask=offset < num_blocks)
+@triton.jit
+def _compute_slot_mappings_kernel(
+    num_tokens,
+    max_num_tokens,
+    cu_num_tokens,  # [num_reqs + 1]
+    pos,  # [num_tokens]
+    block_table_ptrs,  # [num_kv_cache_groups]
+    block_table_strides,  # [num_kv_cache_groups]
+    page_sizes,  # [num_kv_cache_groups]
+    slot_mappings_ptr,  # [num_kv_cache_groups, max_num_tokens]
+    slot_mappings_stride,
+    PAD_ID: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # kv cache group id
+    group_id = tl.program_id(0)
+    req_idx = tl.program_id(1)
+    slot_mapping_ptr = slot_mappings_ptr + group_id * slot_mappings_stride
+    if req_idx == tl.num_programs(1) - 1:
+        # Pad remaining slots to -1. This is needed for CUDA graphs.
+        for i in range(num_tokens, max_num_tokens, BLOCK_SIZE):
+            offset = i + tl.arange(0, BLOCK_SIZE)
+            tl.store(slot_mapping_ptr + offset, PAD_ID, mask=offset < max_num_tokens)
+        return
+    block_table_ptr = _load_ptr(block_table_ptrs + group_id, tl.int32)
+    block_table_stride = tl.load(block_table_strides + group_id)
+    page_size = tl.load(page_sizes + group_id)
+    start_idx = tl.load(cu_num_tokens + req_idx)
+    end_idx = tl.load(cu_num_tokens + req_idx + 1)
+    for i in range(start_idx, end_idx, BLOCK_SIZE):
+        offset = i + tl.arange(0, BLOCK_SIZE)
+        positions = tl.load(pos + offset, mask=offset < end_idx, other=0)
+        block_indices = positions // page_size
+        block_numbers = tl.load(
+            block_table_ptr + req_idx * block_table_stride + block_indices
+        )
+        slot_ids = block_numbers * page_size + positions % page_size
+        tl.store(slot_mapping_ptr + offset, slot_ids, mask=offset < end_idx)
+@triton.jit
+def _load_ptr(ptr_to_ptr, elem_dtype):
+    ptr = tl.load(ptr_to_ptr)
+    ptr = tl.cast(ptr, tl.pointer_type(elem_dtype))
+    return tl.multiple_of(ptr, 16)

vllm/v1/worker/gpu/buffer_utils.py ADDED Viewed

@@ -0,0 +1,224 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable, Sequence
+import numpy as np
+import torch
+from vllm.triton_utils import tl, triton
+from vllm.utils.math_utils import next_power_of_2
+from vllm.utils.platform_utils import is_uva_available
+from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
+class UvaBuffer:
+    def __init__(self, size: int | Sequence[int], dtype: torch.dtype):
+        if not is_uva_available():
+            raise RuntimeError("UVA is not available")
+        self.cpu = torch.zeros(size, dtype=dtype, device="cpu", pin_memory=True)
+        self.np = self.cpu.numpy()
+        self.uva = get_cuda_view_from_cpu_tensor(self.cpu)
+class UvaBufferPool:
+    def __init__(
+        self,
+        size: int | Sequence[int],
+        dtype: torch.dtype,
+        max_concurrency: int = 2,
+    ):
+        self.size = size
+        self.dtype = dtype
+        self.max_concurrency = max_concurrency
+        # UVA buffers for concurrency
+        self._uva_bufs = [UvaBuffer(size, dtype) for _ in range(max_concurrency)]
+        # Current buffer index
+        self._curr = 0
+    def copy_to_uva(self, x: torch.Tensor | np.ndarray | list) -> torch.Tensor:
+        # Round robin to the next buffer.
+        self._curr = (self._curr + 1) % self.max_concurrency
+        buf = self._uva_bufs[self._curr]
+        # CPU-to-CPU copy
+        dst = buf.cpu if isinstance(x, torch.Tensor) else buf.np
+        n = len(x)
+        dst[:n] = x
+        return buf.uva[:n]
+    def copy_to_gpu(
+        self,
+        x: torch.Tensor | np.ndarray,
+        out: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        uva = self.copy_to_uva(x)
+        if out is None:
+            # CPU-to-GPU copy
+            return uva.clone()
+        # CPU-to-GPU copy
+        return out.copy_(uva, non_blocking=True)
+class UvaBackedTensor:
+    def __init__(
+        self,
+        size: int | Sequence[int],
+        dtype: torch.dtype,
+        max_concurrency: int = 2,
+    ):
+        self.dtype = dtype
+        self.max_concurrency = max_concurrency
+        # Source of truth
+        self.cpu = torch.zeros(size, dtype=dtype, device="cpu", pin_memory=False)
+        self.np = self.cpu.numpy()
+        # Buffers for concurrency
+        self.pool = UvaBufferPool(size, dtype, max_concurrency)
+        self.gpu = self.pool.copy_to_uva(self.np)
+    def copy_to_uva(self, n: int | None = None) -> torch.Tensor:
+        # CPU-to-CPU copy
+        self.gpu = self.pool.copy_to_uva(self.np[:n] if n is not None else self.np)
+        return self.gpu
+class StagedWriteTensor:
+    def __init__(
+        self,
+        size: int | Sequence[int],
+        dtype: torch.dtype,
+        device: torch.device,
+        max_concurrency: int = 2,
+        uva_instead_of_gpu: bool = False,
+    ):
+        supported_dtypes = [torch.int32, torch.int64, torch.float32]
+        if dtype not in supported_dtypes:
+            raise ValueError(
+                f"Unsupported dtype {dtype}: should be one of {supported_dtypes}"
+            )
+        self.num_rows = size if isinstance(size, int) else size[0]
+        self.dtype = dtype
+        self.max_concurrency = max_concurrency
+        if not uva_instead_of_gpu:
+            # Create a GPU tensor (default)
+            self.gpu = torch.zeros(size, dtype=dtype, device=device)
+        else:
+            # For a large but not-frequently-accessed tensor, we can use UVA instead of
+            # GPU to save GPU memory
+            self._uva_buf = UvaBuffer(size, dtype)
+            self.gpu = self._uva_buf.uva
+        self._staged_write_indices: list[int] = []
+        self._staged_write_starts: list[int] = []
+        self._staged_write_contents: list[int | float] = []
+        self._staged_write_cu_lens: list[int] = []
+        self.write_indices = UvaBufferPool(
+            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
+        )
+        self.write_starts = UvaBufferPool(
+            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
+        )
+        init_size = next_power_of_2(self.num_rows)
+        self.write_contents = UvaBufferPool(
+            init_size, dtype=dtype, max_concurrency=max_concurrency
+        )
+        self.write_cu_lens = UvaBufferPool(
+            self.num_rows, dtype=torch.int32, max_concurrency=max_concurrency
+        )
+    def stage_write(
+        self,
+        index: int,
+        start: int,
+        x: Iterable[int] | Iterable[float],
+    ) -> None:
+        assert index >= 0
+        assert start >= 0
+        if not x:
+            return
+        self._staged_write_indices.append(index)
+        self._staged_write_starts.append(start)
+        self._staged_write_contents.extend(x)
+        self._staged_write_cu_lens.append(len(self._staged_write_contents))
+    def stage_write_elem(self, index: int, x: int) -> None:
+        assert index >= 0
+        self._staged_write_indices.append(index)
+        self._staged_write_starts.append(0)
+        self._staged_write_contents.append(x)
+        self._staged_write_cu_lens.append(len(self._staged_write_contents))
+    def apply_write(self) -> None:
+        n = len(self._staged_write_indices)
+        if n == 0:
+            return
+        indices_uva = self.write_indices.copy_to_uva(self._staged_write_indices)
+        starts_uva = self.write_starts.copy_to_uva(self._staged_write_starts)
+        cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens)
+        # Special handling for write_contents
+        diff_len = len(self._staged_write_contents)
+        assert isinstance(self.write_contents.size, int)
+        if diff_len > self.write_contents.size:
+            # Re-allocate a larger buffer for the write_contents
+            new_size = next_power_of_2(diff_len)
+            self.write_contents = UvaBufferPool(
+                new_size, dtype=self.dtype, max_concurrency=self.max_concurrency
+            )
+            # NOTE(woosuk): Since the previous write_contents buffer is released,
+            # we perform a synchronization here to ensure that all data transfers
+            # involving the old buffer have finished before allocating a new one.
+            # This prevents potential race conditions. The slight overhead is
+            # negligible because the reallocations are infrequent in practice.
+            torch.cuda.synchronize()
+        contents_uva = self.write_contents.copy_to_uva(self._staged_write_contents)
+        # Write diffs to the GPU buffer
+        _apply_write_kernel[(n,)](
+            self.gpu,
+            self.gpu.stride(0),
+            indices_uva,
+            starts_uva,
+            contents_uva,
+            cu_lens_uva,
+            BLOCK_SIZE=1024,
+        )
+        # Clear the staged writes
+        self.clear_staged_writes()
+    def clear_staged_writes(self) -> None:
+        self._staged_write_indices.clear()
+        self._staged_write_starts.clear()
+        self._staged_write_contents.clear()
+        self._staged_write_cu_lens.clear()
+@triton.jit
+def _apply_write_kernel(
+    output_ptr,
+    output_stride,
+    write_indices_ptr,
+    write_starts_ptr,
+    write_contents_ptr,
+    write_cu_lens_ptr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    row_idx = tl.load(write_indices_ptr + pid)
+    start_idx = tl.load(write_starts_ptr + pid)
+    cu_start = tl.load(write_cu_lens_ptr + pid - 1) if pid > 0 else 0
+    cu_end = tl.load(write_cu_lens_ptr + pid)
+    content_len = cu_end - cu_start
+    for i in range(0, content_len, BLOCK_SIZE):
+        block = i + tl.arange(0, BLOCK_SIZE)
+        mask = block < content_len
+        content = tl.load(write_contents_ptr + cu_start + block, mask=mask)
+        tl.store(
+            output_ptr + row_idx * output_stride + start_idx + block, content, mask=mask
+        )

vllm/v1/worker/gpu/cudagraph_utils.py ADDED Viewed

@@ -0,0 +1,264 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable, Iterable
+from typing import Any
+import numpy as np
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+from vllm.config import VllmConfig
+from vllm.config.compilation import CUDAGraphMode
+from vllm.distributed.parallel_state import graph_capture, is_global_first_rank
+from vllm.forward_context import set_forward_context
+from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.kv_cache_interface import KVCacheConfig
+from vllm.v1.worker.gpu.attn_utils import build_attn_metadata
+from vllm.v1.worker.gpu.block_table import BlockTables
+from vllm.v1.worker.gpu.dp_utils import make_num_tokens_across_dp
+from vllm.v1.worker.gpu.input_batch import InputBuffers
+class CudaGraphManager:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        uses_mrope: bool,
+        device: torch.device,
+    ):
+        self.vllm_config = vllm_config
+        self.scheduler_config = vllm_config.scheduler_config
+        self.uses_mrope = uses_mrope
+        self.device = device
+        self.max_model_len = vllm_config.model_config.max_model_len
+        self.max_num_reqs = self.scheduler_config.max_num_seqs
+        self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
+        self.dp_size = vllm_config.parallel_config.data_parallel_size
+        self.compilation_config = vllm_config.compilation_config
+        assert self.compilation_config is not None
+        self.cudagraph_mode: CUDAGraphMode
+        if self.compilation_config.cudagraph_mode is None:
+            self.cudagraph_mode = CUDAGraphMode.NONE
+        else:
+            self.cudagraph_mode = self.compilation_config.cudagraph_mode
+        self.cudagraph_sizes = get_cudagraph_sizes(
+            self.compilation_config.cudagraph_capture_sizes,
+            self.max_num_reqs,
+            self.max_num_tokens,
+            self.cudagraph_mode,
+        )
+        self.graphs: dict[int, torch.cuda.CUDAGraph] = {}
+        self.pool = torch.cuda.graph_pool_handle()
+        self.hidden_states: torch.Tensor | None = None
+    def needs_capture(self) -> bool:
+        return len(self.cudagraph_sizes) > 0
+    def get_cudagraph_size(
+        self,
+        scheduler_output: SchedulerOutput,
+        num_tokens_after_padding: int,
+    ) -> int | None:
+        return get_cudagraph_size(
+            num_tokens_after_padding,
+            scheduler_output.num_scheduled_tokens.values(),
+            self.cudagraph_sizes,
+            self.cudagraph_mode,
+        )
+    def capture_graph(
+        self,
+        num_tokens: int,
+        model: nn.Module,
+        input_buffers: InputBuffers,
+        block_tables: BlockTables,
+        attn_metadata_builders: list[AttentionMetadataBuilder],
+        kv_cache_config: KVCacheConfig,
+    ) -> None:
+        num_reqs = min(num_tokens, self.max_num_reqs)
+        input_ids = input_buffers.input_ids[:num_tokens]
+        if not self.uses_mrope:
+            positions = input_buffers.positions[:num_tokens]
+        else:
+            positions = input_buffers.mrope_positions[:, :num_tokens]
+        attn_metadata = prepare_inputs_to_capture(
+            num_reqs,
+            num_tokens,
+            input_buffers,
+            block_tables,
+            attn_metadata_builders,
+            self.max_model_len,
+            kv_cache_config,
+        )
+        num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, num_tokens)
+        # Warm up.
+        with set_forward_context(
+            attn_metadata,
+            self.vllm_config,
+            num_tokens=num_tokens,
+            cudagraph_runtime_mode=CUDAGraphMode.NONE,
+            num_tokens_across_dp=num_tokens_across_dp,
+        ):
+            hidden_states = model(
+                input_ids=input_ids,
+                positions=positions,
+            )
+            if self.hidden_states is None:
+                self.hidden_states = torch.empty_like(hidden_states)
+        # Capture the graph.
+        assert num_tokens not in self.graphs
+        graph = torch.cuda.CUDAGraph()
+        with (
+            set_forward_context(
+                attn_metadata,
+                self.vllm_config,
+                num_tokens=num_tokens,
+                cudagraph_runtime_mode=CUDAGraphMode.NONE,
+                num_tokens_across_dp=num_tokens_across_dp,
+            ),
+            torch.cuda.graph(graph, self.pool),
+        ):
+            hidden_states = model(
+                input_ids=input_ids,
+                positions=positions,
+            )
+            self.hidden_states[:num_tokens] = hidden_states
+        self.graphs[num_tokens] = graph
+    @torch.inference_mode()
+    def capture(
+        self,
+        model: nn.Module,
+        input_buffers: InputBuffers,
+        block_tables: BlockTables,
+        attn_metadata_builders: list[AttentionMetadataBuilder],
+        kv_cache_config: KVCacheConfig,
+    ) -> None:
+        capture_graphs(
+            self.cudagraph_sizes,
+            self.device,
+            self.capture_graph,
+            model=model,
+            input_buffers=input_buffers,
+            block_tables=block_tables,
+            attn_metadata_builders=attn_metadata_builders,
+            kv_cache_config=kv_cache_config,
+        )
+    def run(self, num_tokens: int) -> torch.Tensor:
+        assert num_tokens in self.graphs
+        self.graphs[num_tokens].replay()
+        assert self.hidden_states is not None
+        return self.hidden_states[:num_tokens]
+def get_cudagraph_sizes(
+    capture_sizes: list[int] | None,
+    max_num_reqs: int,
+    max_num_tokens: int,
+    cudagraph_mode: CUDAGraphMode,
+) -> dict[int, int]:
+    if not cudagraph_mode.has_full_cudagraphs():
+        return {}
+    if not capture_sizes:
+        return {}
+    capture_sizes = sorted(capture_sizes)
+    # Limit the capture sizes to the max number of requests or tokens.
+    upper_bound = (
+        max_num_reqs
+        if cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY
+        else max_num_tokens
+    )
+    capture_sizes = [x for x in capture_sizes if x <= upper_bound]
+    if not capture_sizes:
+        return {}
+    cudagraph_sizes: dict[int, int] = {}
+    for i in range(1, capture_sizes[-1] + 1):
+        for x in capture_sizes:
+            if i <= x:
+                cudagraph_sizes[i] = x
+                break
+    return cudagraph_sizes
+def get_cudagraph_size(
+    num_tokens_after_dp_padding: int,
+    num_tokens_per_request: Iterable[int],
+    cudagraph_sizes: dict[int, int],
+    cudagraph_mode: CUDAGraphMode,
+) -> int | None:
+    size = cudagraph_sizes.get(num_tokens_after_dp_padding)
+    if size is None:
+        # No CUDA graph for this size.
+        return None
+    if cudagraph_mode == CUDAGraphMode.FULL_DECODE_ONLY:
+        all_decode = all(x == 1 for x in num_tokens_per_request)
+        if not all_decode:
+            # Prefill is included.
+            return None
+    return size
+def capture_graphs(
+    cudagraph_sizes: dict[int, int],
+    device: torch.device,
+    capture_fn: Callable,
+    **capture_kwargs,
+) -> None:
+    # Capture larger graphs first.
+    sizes_to_capture = sorted(set(cudagraph_sizes.values()), reverse=True)
+    if is_global_first_rank():
+        sizes_to_capture = tqdm(sizes_to_capture, desc="Capturing CUDA graphs")
+    with graph_capture(device=device):
+        for size in sizes_to_capture:
+            capture_fn(size, **capture_kwargs)
+def prepare_inputs_to_capture(
+    num_reqs: int,
+    num_tokens: int,
+    input_buffers: InputBuffers,
+    block_tables: BlockTables,
+    attn_metadata_builders: list[AttentionMetadataBuilder],
+    max_model_len: int,
+    kv_cache_config: KVCacheConfig,
+) -> dict[str, Any]:
+    num_tokens_per_req = num_tokens // num_reqs
+    query_start_loc_np = np.arange(num_reqs + 1, dtype=np.int32) * num_tokens_per_req
+    query_start_loc_np[-1] = num_tokens
+    query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
+    input_buffers.query_start_loc[: num_reqs + 1] = query_start_loc_cpu
+    input_buffers.query_start_loc[num_reqs + 1 :] = num_tokens
+    query_start_loc = input_buffers.query_start_loc[: num_reqs + 1]
+    # HACK(woosuk): For faster warmup, we set seq_lens (GPU) to num_tokens
+    # rather than max_model_len.
+    input_buffers.seq_lens[:num_reqs] = num_tokens
+    input_buffers.seq_lens[num_reqs:] = 0
+    input_block_tables = [x[:num_reqs] for x in block_tables.input_block_tables]
+    slot_mappings = block_tables.slot_mappings[:, :num_tokens]
+    attn_metadata = build_attn_metadata(
+        attn_metadata_builders=attn_metadata_builders,
+        num_reqs=num_reqs,
+        num_tokens=num_tokens,
+        query_start_loc_gpu=query_start_loc,
+        query_start_loc_cpu=query_start_loc_cpu,
+        seq_lens=input_buffers.seq_lens,
+        max_seq_len=max_model_len,
+        block_tables=input_block_tables,
+        slot_mappings=slot_mappings,
+        kv_cache_config=kv_cache_config,
+    )
+    return attn_metadata