PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/v1/kv_offload/backend.py ADDED Viewed

@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+class BlockStatus(ctypes.Structure):
+    """
+    Offloading status for a single block of KV data.
+    Holds the following information:
+    ref_cnt - the current number of transfers using this block as a source.
+        A value of -1 indicates the block is not yet ready to be read.
+    load_store_spec - backend-specific information on how to actually
+        read/write the block.
+    """
+    _fields_ = [("ref_cnt", ctypes.c_int32)]
+    def __init__(self):
+        super().__init__()
+        # initialize block as "not ready" (ref_cnt = -1)
+        self.ref_cnt = -1
+    @property
+    def is_ready(self) -> bool:
+        """
+        Returns whether the block is ready to be read.
+        """
+        return self.ref_cnt >= 0
+class Backend(ABC):
+    """
+    An abstract class for allocating and returning specs for writing
+    KV blocks to some backend.
+    """
+    def __init__(self, block_size: int, medium: str):
+        self.block_size = block_size
+        self.medium = medium
+    @abstractmethod
+    def get_num_free_blocks(self):
+        """
+        Returns the number of current number of blocks that can be allocated.
+        """
+        pass
+    @abstractmethod
+    def allocate_blocks(self,
+                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        """
+        Allocate space for writing blocks.
+        This method assumes there is enough space for allocation.
+        It is unsafe to use without checking get_num_free_blocks beforehand.
+        Args:
+            block_hashes: the hashes identifying the blocks to be written.
+        Returns:
+            A list of BlockStatus for the allocated blocks.
+            The ref_cnt of each returned item will be -1, meaning the block
+            is not yet ready to be read.
+        """
+        pass
+    @abstractmethod
+    def free(self, block: BlockStatus):
+        """
+        Free a previously allocated block.
+        You should only call this function with blocks returned by
+        allocate_blocks, and only once per each block.
+        Args:
+            block: The block to be freed.
+        """
+        pass
+    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
+                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+        """
+        Get backend-specific information on how to read/write blocks.
+        Args:
+            block_hashes: the list of block hashes identifying the blocks.
+            blocks: the list of blocks.
+        Returns:
+            A LoadStoreSpec that can be used by a worker
+            to read/write the blocks.
+        """
+        raise NotImplementedError

vllm/v1/kv_offload/backends/__init__.py ADDED Viewed

File without changes

vllm/v1/kv_offload/backends/cpu.py ADDED Viewed

@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from collections.abc import Iterable
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
+class CPUBlockStatus(BlockStatus):
+    _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64)
+                                       ]  # type: ignore
+    def __init__(self, block_id: int):
+        super().__init__()
+        self.block_id = block_id
+class CPUBackend(Backend):
+    def __init__(self, block_size: int, num_blocks: int):
+        super().__init__(block_size=block_size,
+                         medium=CPULoadStoreSpec.medium())
+        self.num_blocks: int = num_blocks
+        self.num_allocated_blocks: int = 0
+        self.allocated_blocks_free_list: list[int] = []
+    def get_num_free_blocks(self):
+        return (len(self.allocated_blocks_free_list) + self.num_blocks -
+                self.num_allocated_blocks)
+    def allocate_blocks(self,
+                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        num_fresh_blocks = min(len(block_hashes),
+                               self.num_blocks - self.num_allocated_blocks)
+        num_reused_blocks = len(block_hashes) - num_fresh_blocks
+        assert len(self.allocated_blocks_free_list) >= num_reused_blocks
+        # allocate fresh blocks
+        blocks: list[BlockStatus] = []
+        for _ in range(num_fresh_blocks):
+            blocks.append(CPUBlockStatus(self.num_allocated_blocks))
+            self.num_allocated_blocks += 1
+        # allocate reused blocks
+        for _ in range(num_reused_blocks):
+            block_id = self.allocated_blocks_free_list.pop()
+            blocks.append(CPUBlockStatus(block_id))
+        return blocks
+    def free(self, block: BlockStatus):
+        assert isinstance(block, CPUBlockStatus)
+        self.allocated_blocks_free_list.append(block.block_id)
+    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
+                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+        return CPULoadStoreSpec([block.block_id for block in blocks])

vllm/v1/kv_offload/cpu.py ADDED Viewed

@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterator
+from typing import Optional
+import torch
+from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.platforms import current_platform
+from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from vllm.v1.kv_offload.backends.cpu import CPUBackend
+from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
+from vllm.v1.kv_offload.spec import OffloadingSpec
+from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
+from vllm.v1.kv_offload.worker.worker import OffloadingHandler
+class CPUOffloadingSpec(OffloadingSpec):
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+        num_cpu_blocks = self.extra_config.get("num_cpu_blocks")
+        if not num_cpu_blocks:
+            raise Exception("num_cpu_blocks must be specified "
+                            "in kv_connector_extra_config")
+        self.num_cpu_blocks: int = num_cpu_blocks
+        # scheduler-side
+        self._manager: Optional[OffloadingManager] = None
+        # worker-side
+        self._handler: Optional[OffloadingHandler] = None
+    def get_manager(self) -> OffloadingManager:
+        if not self._manager:
+            kv_events_config = self.vllm_config.kv_events_config
+            enable_events = (kv_events_config is not None
+                             and kv_events_config.enable_kv_cache_events)
+            self._manager = LRUOffloadingManager(CPUBackend(
+                block_size=self.offloaded_block_size,
+                num_blocks=self.num_cpu_blocks),
+                                                 enable_events=enable_events)
+        return self._manager
+    def get_handlers(
+        self, kv_caches: dict[str, torch.Tensor]
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
+                        OffloadingHandler]]:
+        if not self._handler:
+            if not current_platform.is_cuda():
+                raise Exception("CPU Offloading is currently only supported"
+                                " on CUDA GPUs")
+            layer_names = list(kv_caches.keys())
+            layers = get_layers_from_vllm_config(self.vllm_config,
+                                                 AttentionLayerBase,
+                                                 layer_names)
+            attn_backends = {
+                layer_name: layers[layer_name].get_attn_backend()
+                for layer_name in layer_names
+            }
+            self._handler = CpuGpuOffloadingHandler(
+                attn_backends=attn_backends,
+                gpu_block_size=self.gpu_block_size,
+                cpu_block_size=self.offloaded_block_size,
+                num_cpu_blocks=self.num_cpu_blocks,
+                gpu_caches=kv_caches)
+        assert self._handler is not None
+        yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
+        yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler

vllm/v1/kv_offload/factory.py ADDED Viewed

@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import importlib
+from typing import TYPE_CHECKING, Callable
+from vllm.logger import init_logger
+from vllm.v1.kv_offload.spec import OffloadingSpec
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+logger = init_logger(__name__)
+class OffloadingSpecFactory:
+    _registry: dict[str, Callable[[], type[OffloadingSpec]]] = {}
+    @classmethod
+    def register_spec(cls, name: str, module_path: str,
+                      class_name: str) -> None:
+        """Register a spec with a lazy-loading module and class name."""
+        if name in cls._registry:
+            raise ValueError(f"Connector '{name}' is already registered.")
+        def loader() -> type[OffloadingSpec]:
+            module = importlib.import_module(module_path)
+            return getattr(module, class_name)
+        cls._registry[name] = loader
+    @classmethod
+    def create_spec(
+        cls,
+        config: "VllmConfig",
+    ) -> OffloadingSpec:
+        kv_transfer_config = config.kv_transfer_config
+        assert kv_transfer_config is not None
+        extra_config = kv_transfer_config.kv_connector_extra_config
+        spec_name = extra_config.get("spec_name", "CPUOffloadingSpec")
+        if spec_name in cls._registry:
+            spec_cls = cls._registry[spec_name]()
+        else:
+            spec_module_path = extra_config.get("spec_module_path")
+            if spec_module_path is None:
+                raise ValueError(f"Unsupported spec type: {spec_name}")
+            spec_module = importlib.import_module(spec_module_path)
+            spec_cls = getattr(spec_module, spec_name)
+        assert issubclass(spec_cls, OffloadingSpec)
+        logger.info("Creating offloading spec with name: %s", spec_name)
+        return spec_cls(config)
+# Register various specs here.
+OffloadingSpecFactory.register_spec("CPUOffloadingSpec",
+                                    "vllm.v1.kv_offload.cpu",
+                                    "CPUOffloadingSpec")

vllm/v1/kv_offload/lru_manager.py ADDED Viewed

@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import OrderedDict
+from collections.abc import Iterable
+from typing import Optional
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
+                                         OffloadingManager, PrepareStoreOutput)
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+class LRUOffloadingManager(OffloadingManager):
+    """
+    An OffloadingManager with a pluggable backend, which evicts blocks by LRU.
+    """
+    def __init__(self, backend: Backend, enable_events: bool = False):
+        self.backend: Backend = backend
+        # block_hash -> BlockStatus
+        self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        self.events: Optional[list[OffloadingEvent]] = \
+            [] if enable_events else None
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        hit_count = 0
+        for block_hash in block_hashes:
+            block = self.blocks.get(block_hash)
+            if block is None or not block.is_ready:
+                break
+            hit_count += 1
+        return hit_count
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        blocks = []
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.is_ready
+            block.ref_cnt += 1
+            blocks.append(block)
+        return self.backend.get_load_store_spec(block_hashes, blocks)
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in reversed(list(block_hashes)):
+            if self.blocks.get(block_hash):
+                self.blocks.move_to_end(block_hash)
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.ref_cnt > 0
+            block.ref_cnt -= 1
+    def prepare_store(
+            self,
+            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        # filter out blocks that are already stored
+        block_hashes_to_store = [
+            block_hash for block_hash in block_hashes
+            if block_hash not in self.blocks
+        ]
+        num_blocks_to_evict = (len(block_hashes_to_store) -
+                               self.backend.get_num_free_blocks())
+        # build list of blocks to evict
+        to_evict = []
+        if num_blocks_to_evict > 0:
+            for block_hash, block in self.blocks.items():
+                if block.ref_cnt == 0:
+                    to_evict.append(block_hash)
+                    num_blocks_to_evict -= 1
+                    if num_blocks_to_evict == 0:
+                        break
+            else:
+                # we could not evict enough blocks
+                return None
+        # evict blocks
+        for block_hash in to_evict:
+            self.backend.free(self.blocks.pop(block_hash))
+        if to_evict and self.events is not None:
+            self.events.append(
+                OffloadingEvent(block_hashes=to_evict,
+                                block_size=self.backend.block_size,
+                                medium=self.backend.medium,
+                                removed=True))
+        blocks = self.backend.allocate_blocks(block_hashes_to_store)
+        assert len(blocks) == len(block_hashes_to_store)
+        for block_hash, block in zip(block_hashes_to_store, blocks):
+            self.blocks[block_hash] = block
+        # build store specs for allocated blocks
+        store_spec = self.backend.get_load_store_spec(block_hashes_to_store,
+                                                      blocks)
+        return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store,
+                                  store_spec=store_spec,
+                                  block_hashes_evicted=to_evict)
+    def complete_store(self,
+                       block_hashes: Iterable[BlockHash],
+                       success: bool = True):
+        stored_block_hashes: list[BlockHash] = []
+        if success:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    block.ref_cnt = 0
+                    stored_block_hashes.append(block_hash)
+        else:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    self.backend.free(block)
+                    del self.blocks[block_hash]
+        if stored_block_hashes and self.events is not None:
+            self.events.append(
+                OffloadingEvent(block_hashes=stored_block_hashes,
+                                block_size=self.backend.block_size,
+                                medium=self.backend.medium,
+                                removed=False))
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        if self.events is not None:
+            yield from self.events
+            self.events.clear()

vllm/v1/kv_offload/mediums.py ADDED Viewed

@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC
+import numpy as np
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing KV blocks from given block numbers.
+    """
+    def __init__(self, block_ids: list[int]):
+        self.block_ids = np.array(block_ids, dtype=np.int64)
+    def __repr__(self) -> str:
+        return repr(self.block_ids)
+class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+    """
+    @staticmethod
+    def medium() -> str:
+        """
+        Returns the medium name for this spec type, the string "GPU".
+        """
+        return "GPU"
+class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+    @staticmethod
+    def medium() -> str:
+        """
+        Returns the medium name for this spec type, the string "CPU".
+        """
+        return "CPU"

vllm/v1/kv_offload/spec.py ADDED Viewed

@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Iterator
+from typing import TYPE_CHECKING
+import torch
+from vllm.logger import init_logger
+from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
+from vllm.v1.kv_offload.worker.worker import OffloadingHandler
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+logger = init_logger(__name__)
+class OffloadingSpec(ABC):
+    """Spec for an offloading connector"""
+    def __init__(self, vllm_config: "VllmConfig"):
+        logger.warning(
+            "Initializing OffloadingSpec. This API is experimental and "
+            "subject to change in the future as we iterate the design.")
+        self.vllm_config = vllm_config
+        kv_transfer_config = vllm_config.kv_transfer_config
+        assert kv_transfer_config is not None
+        self.extra_config = kv_transfer_config.kv_connector_extra_config
+        self.gpu_block_size = vllm_config.cache_config.block_size
+        self.offloaded_block_size = int(
+            self.extra_config.get("block_size", self.gpu_block_size))
+        assert self.offloaded_block_size % self.gpu_block_size == 0
+    @abstractmethod
+    def get_manager(self) -> OffloadingManager:
+        """
+        Get an OffloadingManager that will be used
+        by the scheduler-side offloading connector to track
+        offloaded blocks and manage evictions.
+        """
+        pass
+    @abstractmethod
+    def get_handlers(
+        self, kv_caches: dict[str, torch.Tensor]
+    ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec],
+                        OffloadingHandler]]:
+        """
+        Get offloading handlers along with their respective src and dst types.
+        Args:
+            kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor.
+        Yields:
+            Tuples of (src_type, dst_type, offloading_handler).
+        """
+        pass

vllm/v1/kv_offload/worker/__init__.py ADDED Viewed

File without changes