PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py ADDED Viewed

@@ -0,0 +1,384 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import hashlib
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+import safetensors
+import torch
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import MLACommonMetadata
+from vllm.v1.core.sched.output import SchedulerOutput
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.forward_context import ForwardContext
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.request import Request
+logger = init_logger(__name__)
+@dataclass
+class ReqMeta:
+    # Request tokens
+    token_ids: torch.Tensor
+    # Slot mappings, should have the same length as token_ids
+    slot_mapping: torch.Tensor
+    # Is store or load
+    is_store: bool
+    @staticmethod
+    def make_meta(token_ids: list[int], block_ids: list[int], block_size: int,
+                  is_store: bool) -> "ReqMeta":
+        valid_num_tokens = align_to_block_size(len(token_ids), block_size)
+        token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens]
+        block_ids_tensor = torch.tensor(block_ids)
+        num_blocks = block_ids_tensor.shape[0]
+        block_offsets = torch.arange(0, block_size)
+        slot_mapping = block_offsets.reshape((1, block_size)) + \
+                block_ids_tensor.reshape((num_blocks, 1)) * block_size
+        slot_mapping = slot_mapping.flatten()[:valid_num_tokens]
+        return ReqMeta(
+            token_ids=token_ids_tensor,
+            slot_mapping=slot_mapping,
+            is_store=is_store,
+        )
+@dataclass
+class SharedStorageConnectorMetadata(KVConnectorMetadata):
+    requests: list[ReqMeta]
+    def __init__(self):
+        self.requests = []
+    def add_request(
+        self,
+        token_ids: list[int],
+        block_ids: list[int],
+        block_size: int,
+        is_store: bool,
+    ) -> None:
+        self.requests.append(
+            ReqMeta.make_meta(token_ids, block_ids, block_size, is_store))
+class SharedStorageConnector(KVConnectorBase_V1):
+    # NOTE: This is Simple debug implementation of the KV connector.
+    # It save / load the KV cache to / from the disk.
+    # It does extra work which will overwrite the existing prefix-cache in GPU
+    # - to remove the overhead, need to add some "mask" in the ReqMeta class
+    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+        super().__init__(vllm_config=vllm_config, role=role)
+        self._block_size = vllm_config.cache_config.block_size
+        self._requests_need_load: dict[str, Request] = {}
+        transfer_config = vllm_config.kv_transfer_config
+        self._storage_path = transfer_config.get_from_extra_config(
+            "shared_storage_path", "/tmp")
+        logger.info(vllm_config.kv_transfer_config)
+        logger.info("Shared storage path is %s", self._storage_path)
+    def start_load_kv(self, forward_context: "ForwardContext",
+                      **kwargs) -> None:
+        """Start loading the KV cache from the connector buffer to vLLM's
+        paged KV buffer.
+        Args:
+            forward_context (ForwardContext): the forward context.
+            **kwargs: additional arguments for the load operation
+        Note:
+            The number of elements in kv_caches and layer_names should be
+            the same.
+        """
+        attn_metadata = forward_context.attn_metadata
+        def inject_kv_into_layer(
+            dst_kv_cache_layer: torch.Tensor,
+            src_kv_cache: torch.Tensor,
+            slot_mapping: torch.Tensor,
+        ) -> None:
+            """Inject the KV cache into the layer.
+            Args:
+                dst_kv_cache_layer (torch.Tensor): the destination KV cache
+                    layer. In shape [2, num_pages, page_size, xxx] if not
+                    using MLA, [num_pages, page_size, xxx] otherwise.
+                src_kv_cache (torch.Tensor): the source KV cache. In shape
+                    [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx]
+                    otherwise.
+                slot_mapping (torch.Tensor): the slot mapping. In shape
+                    [num_tokens].
+            """
+            dst_kv_cache_layer_shape = dst_kv_cache_layer.shape
+            if isinstance(attn_metadata, MLACommonMetadata):
+                num_pages = dst_kv_cache_layer_shape[0]
+                page_size = dst_kv_cache_layer_shape[1]
+                dst_kv_cache_layer = dst_kv_cache_layer.reshape(
+                    num_pages * page_size, -1)
+                dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache
+                dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape)
+            else:
+                num_pages = dst_kv_cache_layer_shape[1]
+                page_size = dst_kv_cache_layer_shape[2]
+                dst_kv_cache_layer = dst_kv_cache_layer.reshape(
+                    2, num_pages * page_size, -1)
+                dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache
+                dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape)
+        # Get the metadata
+        metadata: KVConnectorMetadata = self._get_connector_metadata()
+        assert isinstance(metadata, SharedStorageConnectorMetadata)
+        if metadata is None:
+            logger.warning(
+                "In connector.start_load_kv, but the connector metadata is None"
+            )
+            return
+        attn_metadata = forward_context.attn_metadata
+        if attn_metadata is None:
+            logger.warning(
+                "In connector.start_load_kv, but the attn_metadata is None")
+            return
+        # Load the KV for each request each layer
+        for request in metadata.requests:
+            if request.is_store:
+                continue
+            logger.info("Inject KV cache of %d tokens to the paged memory",
+                        len(request.slot_mapping))
+            for layer_name in forward_context.no_compile_layers:
+                attn_layer = forward_context.no_compile_layers[layer_name]
+                kv_cache_layer = attn_layer.kv_cache[\
+                        forward_context.virtual_engine]
+                filename = self._generate_filename_debug(
+                    layer_name, request.token_ids)
+                kv_cache = safetensors.torch.load_file(
+                    filename)["kv_cache"].cuda()
+                inject_kv_into_layer(kv_cache_layer, kv_cache,
+                                     request.slot_mapping)
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        """Blocking until the KV for a specific layer is loaded into vLLM's
+        paged buffer.
+        This interface will be useful for layer-by-layer pipelining.
+        Args:
+            layer_name: the name of that layer
+        """
+        return
+    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
+                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+        """Start saving the KV cache of the layer from vLLM's paged buffer
+        to the connector.
+        Args:
+            layer_name (str): the name of the layer.
+            kv_layer (torch.Tensor): the paged KV buffer of the current
+                layer in vLLM.
+            attn_metadata (AttentionMetadata): the attention metadata.
+            **kwargs: additional arguments for the save operation.
+        """
+        def extract_kv_from_layer(
+            layer: torch.Tensor,
+            slot_mapping: torch.Tensor,
+        ) -> torch.Tensor:
+            """Extract the KV cache from the layer.
+            Assume the shape of the layer is (2, num_pages, page_size, xxx)
+            if MLA is not used, and (num_pages, page_size, xxx) otherwise.
+            """
+            if isinstance(attn_metadata, MLACommonMetadata):
+                num_pages, page_size = layer.shape[0], layer.shape[1]
+                return layer.reshape(num_pages * page_size, -1)[slot_mapping,
+                                                                ...]
+            num_pages, page_size = layer.shape[1], layer.shape[2]
+            return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping,
+                                                               ...]
+        connector_metadata = self._get_connector_metadata()
+        assert isinstance(connector_metadata, SharedStorageConnectorMetadata)
+        for request in connector_metadata.requests:
+            if request.is_store:
+                filename = self._generate_filename_debug(
+                    layer_name, request.token_ids)
+                kv_cache = extract_kv_from_layer(kv_layer,
+                                                 request.slot_mapping)
+                tensors = {"kv_cache": kv_cache.detach().cpu()}
+                safetensors.torch.save_file(tensors, filename)
+    def wait_for_save(self):
+        return
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[int, bool]:
+        """
+        Get number of new tokens that can be loaded from the
+        external KV cache beyond the num_computed_tokens.
+        Args:
+            request (Request): the request object.
+            num_computed_tokens (int): the number of locally
+                computed tokens for this request
+        Returns:
+            the number of tokens that can be loaded from the
+            external KV cache beyond what is already computed.
+        """
+        # NOTE: in this debug implementation, we assume that the prompt is
+        # cached_prompt + newly_generated_single_token
+        # Therefore, we use prompt_token_ids[:-1] to determine the folder name
+        # NOTE: in current v1 scheduler, the num_computed_tokens is aligned
+        # with the block granularity. And it expects the returned blocks and
+        # num_computed_tokens to also be aligned with the block granularity.
+        if not self._found_match_for_request(request):
+            return 0, False
+        logger.info("External Cache Hit!")
+        # Now, first num_tokens_to_check tokens are hit, we need to prepare
+        # the metadata for the worker connector to correctly load the KV
+        num_tokens_to_check = align_to_block_size(
+            len(request.prompt_token_ids) - 1, self._block_size)
+        return num_tokens_to_check - num_computed_tokens, False
+    def update_state_after_alloc(self, request: "Request",
+                                 blocks: "KVCacheBlocks",
+                                 num_external_tokens: int):
+        """
+        Update KVConnector state after block allocation.
+        If blocks were allocated, add to _requests_need_load,
+        such that we load the KVs in the next forward pass.
+        """
+        if num_external_tokens > 0:
+            self._requests_need_load[request.request_id] = request
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> KVConnectorMetadata:
+        """Build the connector metadata for this step.
+        This function should NOT modify any fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+        meta = SharedStorageConnectorMetadata()
+        total_need_load = 0
+        for new_req in scheduler_output.scheduled_new_reqs:
+            if new_req.req_id in self._requests_need_load:
+                meta.add_request(token_ids=new_req.prompt_token_ids,
+                                 block_ids=new_req.block_ids[0],
+                                 block_size=self._block_size,
+                                 is_store=False)
+                total_need_load += 1
+            else:
+                # NOTE: here, we set the store and load being exclusive,
+                # but a single request can have both store and load.
+                # NOTE(rob): for this debug implementation, we only cache
+                # the original prompt tokens.
+                if not self._found_match_for_request(new_req):
+                    meta.add_request(token_ids=new_req.prompt_token_ids,
+                                     block_ids=new_req.block_ids[0],
+                                     block_size=self._block_size,
+                                     is_store=True)
+        for cached_req in scheduler_output.scheduled_cached_reqs:
+            # NOTE(rob): here we rely on the resumed requests being
+            # the first N requests in the list scheduled_cache_reqs.
+            if not cached_req.resumed_from_preemption:
+                break
+            if cached_req.req_id in self._requests_need_load:
+                # NOTE(rob): cached_req_data does not have the full
+                # list of token ids (only new tokens). So we look it
+                # up in the actual request object.
+                request = self._requests_need_load[cached_req.req_id]
+                total_tokens = (len(cached_req.new_token_ids) +
+                                cached_req.num_computed_tokens)
+                token_ids = request.all_token_ids[:total_tokens]
+                # NOTE(rob): For resumed req, new_block_ids is all
+                # of the block_ids for the request.
+                block_ids = cached_req.new_block_ids[0]
+                meta.add_request(token_ids=token_ids,
+                                 block_ids=block_ids,
+                                 block_size=self._block_size,
+                                 is_store=False)
+                total_need_load += 1
+        assert total_need_load == len(self._requests_need_load)
+        self._requests_need_load.clear()
+        return meta
+    # ==============================
+    # Helper functions
+    # ==============================
+    def _found_match_for_request(
+        self,
+        request: "Request",
+    ) -> bool:
+        """Check if the cache is hit for the request.
+        """
+        num_tokens_to_check = align_to_block_size(
+            len(request.prompt_token_ids) - 1, self._block_size)
+        foldername = self._generate_foldername_debug(torch.tensor(
+            request.prompt_token_ids)[:num_tokens_to_check],
+                                                     create_folder=False)
+        return os.path.exists(foldername)
+    def _generate_foldername_debug(
+        self,
+        input_ids: torch.Tensor,
+        create_folder=False,
+    ) -> str:
+        """Generate a folder name based on the hash of the bytes of the input
+        ids.
+        """
+        input_ids_bytes = input_ids.numpy().tobytes()
+        input_ids_hash = hashlib.md5(input_ids_bytes,
+                                     usedforsecurity=False).hexdigest()
+        foldername = os.path.join(self._storage_path, input_ids_hash)
+        if create_folder:
+            os.makedirs(foldername, exist_ok=True)
+        return foldername
+    def _generate_filename_debug(
+        self,
+        layer_name: str,
+        input_ids: torch.Tensor,
+    ) -> str:
+        """Generate a file name based on the layer name and the hash
+        of the bytes of the input ids.
+        """
+        foldername = self._generate_foldername_debug(input_ids,
+                                                     create_folder=True)
+        return os.path.join(foldername, f"{layer_name}.safetensors")
+def align_to_block_size(num_tokens: int, block_size) -> int:
+    """Align the number of tokens to the block size.
+    """
+    return (num_tokens - 1) // block_size * block_size

vllm/distributed/kv_transfer/kv_connector_agent.py ADDED Viewed

@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""A centralized entrypoint to perform distributed KV cache transfer.
+This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
+1. `send_kv_caches_and_hidden_states`
+2. `recv_kv_caches_and_hidden_states
+"""
+from typing import TYPE_CHECKING, Union
+if TYPE_CHECKING:
+    from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
+    from vllm.config import VllmConfig
+import torch
+from vllm.distributed.kv_transfer.kv_connector.factory import (
+    KVConnectorFactory)
+from vllm.logger import init_logger
+from vllm.sequence import IntermediateTensors
+logger = init_logger(__name__)
+class KVTransferAgent:
+    """
+    A class designated for distributed KV transfer
+    Target use cases:
+        1. Disaggregated prefill
+        2. Remote KV cache storage
+    """
+    def __init__(
+        self,
+        rank: int,
+        local_rank: int,
+        config: "VllmConfig",
+    ):
+        self.config = config
+        if config.kv_transfer_config is None:
+            raise ValueError("KVTransferConfig is not set in the VllmConfig,"
+                             " cannot initialize KVConnector.")
+        assert self.config.kv_transfer_config.is_kv_transfer_instance, "KV"\
+            "TransferAgent should only be used when kv_connector is set."
+        self.connector = KVConnectorFactory.create_connector_v0(
+            rank, local_rank, config)
+    def send_kv_caches_and_hidden_states(
+        self,
+        model_executable: torch.nn.Module,
+        model_input: "ModelInputForGPUWithSamplingMetadata",
+        kv_caches: list[torch.Tensor],
+        hidden_or_intermediate_states: Union[torch.Tensor,
+                                             IntermediateTensors],
+    ) -> None:
+        self.connector.send_kv_caches_and_hidden_states(
+            model_executable, model_input, kv_caches,
+            hidden_or_intermediate_states)
+    def close(self) -> None:
+        self.connector.close()
+    def recv_kv_caches_and_hidden_states(
+        self, model_executable: torch.nn.Module,
+        model_input: "ModelInputForGPUWithSamplingMetadata",
+        kv_caches: list[torch.Tensor]
+    ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool,
+               "ModelInputForGPUWithSamplingMetadata"]:
+        return self.connector.recv_kv_caches_and_hidden_states(
+            model_executable, model_input, kv_caches)

vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py ADDED Viewed

File without changes

vllm/distributed/kv_transfer/kv_lookup_buffer/base.py ADDED Viewed

@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This file contains a new class `KVLookupBufferBase` that allows developers to
+think of KV cache operations as inserting new KV cache entries (`insert`)
+into the lookup buffer and querying existing KV caches (`drop_select`)
+from the lookup buffer.
+This file also contains a new class `KVStoreBufferBase` that allows developers
+to manage the KVCache buffer as a simple key-value storage buffer with basic
+put/get operations.
+These classes above are abstracted behind class `KVCacheBufferBase`.
+"""
+from abc import ABC, abstractmethod
+from typing import Optional
+import torch
+class KVCacheBufferBase(ABC):
+    """
+    Abstract base class for a KVCache buffer.
+    """
+    @abstractmethod
+    def close(self) -> None:
+        """Close the buffer and release resources.
+        This method is responsible for cleaning up resources related to the
+        KVCache buffer when it is no longer needed.
+        Raises:
+            NotImplementedError: This method must be implemented in subclasses.
+        """
+        raise NotImplementedError
+class KVLookupBufferBase(KVCacheBufferBase):
+    """
+    Abstract base class for a KVCache lookup buffer.
+    This class provides an abstraction for a key-value (KV) cache lookup buffer.
+    The key of the lookup buffer:
+    - input_tokens: token IDs of the request
+    - roi: a binary mask on top of input_tokens.
+      - Purpose of roi: Since KV cache may only be available for a subset of
+        tokens in the input (for example, when vLLM is connected to an external
+        KV cache service), roi specifies the subset of tokens that the KV cache
+        is associated with.
+      - NOTE: roi can be further extended to describe which part of KV the
+        current process is holding (each process may only hold a part of KV
+        due to TP and PP). This is not implemented for now.
+    The value of the lookup buffer:
+    - key: the key tensor in the KV cache
+    - value: the value tensor in the KV cache
+    - hidden: the final hidden state generated by model forwarding. This allows
+      vLLM to bypass further model forwarding by transmitting the hidden state.
+    """
+    @abstractmethod
+    def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor,
+               key: torch.Tensor, value: torch.Tensor,
+               hidden: torch.Tensor) -> None:
+        """Insert into the lookup buffer.
+        The functionality is similar to the following python statement
+        ```
+        buffer[input_tokens, roi] = [key, value, hidden]
+        ```
+        FIXME: in the future, we should only have two arguments, key and value,
+        where key is a tensor dict and value is a tensor dict.
+        FIXME: we should transmit both sampler outputs and the hidden states.
+        Args:
+            input_tokens (torch.Tensor): token IDs.
+            roi (torch.Tensor): A binary mask on top of the input tokens
+            key (torch.Tensor): The key tensor in the KV cache.
+            value (torch.Tensor): The value tensor in the KV cache.
+            hidden (torch.Tensor): The final hidden state tensor generated
+                                   during model forwarding to bypass model
+                                   forwarding.
+        Raises:
+            NotImplementedError: This method must be implemented in subclasses.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def drop_select(
+            self, input_tokens: Optional[torch.Tensor],
+            roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]:
+        """Select and *drop* KV cache entries from the lookup buffer.
+        The functionality is similar to the following python statements
+        ```
+        ret = buffer.pop(input_tokens, roi)
+        return ret
+        ```
+        If `input_tokens` and `roi` is `None`, it means selecting any of the
+        KV caches in the buffer, return, and remove it from the buffer, useful
+        when offloading KV cache to KV cache storage service.
+        Args:
+            input_tokens (torch.Tensor): token IDs.
+            roi (torch.Tensor): A binary mask on top of the input tokens
+        Returns:
+            list[Optional[torch.Tensor]]: A list of tensors. Can be None.
+        Raises:
+            NotImplementedError: This method must be implemented in subclasses.
+        """
+        raise NotImplementedError
+class KVStoreBufferBase(KVCacheBufferBase):
+    """
+    Abstract base class for a KVCache storage buffer with key-value semantics.
+    This class provides a simple key-value storage buffer abstract with basic
+    put/get operations, which enables flexible KVCache transfer granular
+    control.
+    The functionality is similar to a distributed key-value store, where:
+    - Key: A unique string identifier for the cached entry
+    - Value:
+        - Tensor to be stored and retrieved
+        - None (indicating deletion or empty value)
+    """
+    @abstractmethod
+    def put(
+        self,
+        key: str,
+        value: Optional[torch.Tensor],
+    ) -> None:
+        """Store a key-value pair in the buffer.
+        Args:
+            key (str): Unique identifier for a tensor, this tensor could be the
+                key cache tensor, value cache tensor, or hidden state tensor
+                generated during model forwarding.
+            value (Optional[torch.Tensor]): Tensor to be stored.
+        Raises:
+            NotImplementedError: This method must be implemented in subclasses.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def get(
+        self,
+        key: str,
+    ) -> Optional[torch.Tensor]:
+        """Retrieve a value from the buffer by key.
+        Args:
+            key (str): Unique identifier for a tensor, this tensor could be the
+                key cache tensor, value cache tensor, or hidden state tensor
+                generated during model forwarding.
+        Returns:
+            Optional[torch.Tensor]: Stored tensor if exists, None otherwise.
+        Raises:
+            NotImplementedError: This method must be implemented in subclasses.
+        """
+        raise NotImplementedError