PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py ADDED Viewed

@@ -0,0 +1,168 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Any, Optional
+import torch
+from lmcache.integration.vllm.vllm_v1_adapter import LMCacheConnectorV1Impl
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.forward_context import ForwardContext
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.request import Request
+logger = init_logger(__name__)
+class LMCacheConnectorV1(KVConnectorBase_V1):
+    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+        super().__init__(vllm_config=vllm_config, role=role)
+        self._lmcache_engine = LMCacheConnectorV1Impl(vllm_config, role, self)
+    # ==============================
+    # Worker-side methods
+    # ==============================
+    def start_load_kv(self, forward_context: "ForwardContext",
+                      **kwargs: Any) -> None:
+        """
+        Start loading the KV cache from the connector to vLLM's paged
+        KV buffer. This is called from the forward context before the
+        forward pass to enable async loading during model execution.
+        Args:
+            forward_context (ForwardContext): the forward context.
+            **kwargs: additional arguments for the load operation
+        Note:
+            The number of elements in kv_caches and layer_names should be
+            the same.
+        """
+        self._lmcache_engine.start_load_kv(forward_context, **kwargs)
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        """
+        Block until the KV for a specific layer is loaded into vLLM's
+        paged buffer. This is called from within attention layer to ensure
+        async copying from start_load_kv is complete.
+        This interface will be useful for layer-by-layer pipelining.
+        Args:
+            layer_name: the name of that layer
+        """
+        self._lmcache_engine.wait_for_layer_load(layer_name)
+    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
+        """
+        Start saving the a layer of KV cache from vLLM's paged buffer
+        to the connector. This is called from within attention layer to
+        enable async copying during execution.
+        Args:
+            layer_name (str): the name of the layer.
+            kv_layer (torch.Tensor): the paged KV buffer of the current
+                layer in vLLM.
+            attn_metadata (AttentionMetadata): the attention metadata.
+            **kwargs: additional arguments for the save operation.
+        """
+        self._lmcache_engine.save_kv_layer(layer_name, kv_layer, attn_metadata,
+                                           **kwargs)
+    def wait_for_save(self):
+        """
+        Block until all the save operations is done. This is called
+        as the forward context exits to ensure that the async saving
+        from save_kv_layer is complete before finishing the forward.
+        This prevents overwrites of paged KV buffer before saving done.
+        """
+        self._lmcache_engine.wait_for_save()
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+        """
+        Notifies worker-side connector ids of requests that have
+        finished generating tokens.
+        Returns:
+            ids of requests that have finished asynchronous transfer
+            (requests that previously returned True from request_finished()),
+            tuple of (sending/saving ids, recving/loading ids).
+            The finished saves/sends req ids must belong to a set provided in a
+            call to this method (this call or a prior one).
+        """
+        return self._lmcache_engine.get_finished(finished_req_ids)
+    # ==============================
+    # Scheduler-side methods
+    # ==============================
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[Optional[int], bool]:
+        """
+        Get number of new tokens that can be loaded from the
+        external KV cache beyond the num_computed_tokens.
+        Args:
+            request (Request): the request object.
+            num_computed_tokens (int): the number of locally
+                computed tokens for this request
+        Returns:
+            the number of tokens that can be loaded from the
+            external KV cache beyond what is already computed.
+        """
+        return self._lmcache_engine.get_num_new_matched_tokens(
+            request, num_computed_tokens), False
+    def update_state_after_alloc(self, request: "Request",
+                                 blocks: "KVCacheBlocks",
+                                 num_external_tokens: int):
+        """
+        Update KVConnector state after block allocation.
+        """
+        self._lmcache_engine.update_state_after_alloc(request,
+                                                      num_external_tokens)
+    def build_connector_meta(
+            self, scheduler_output: SchedulerOutput) -> KVConnectorMetadata:
+        """
+        Build the connector metadata for this step.
+        This function should NOT modify fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+        return self._lmcache_engine.build_connector_meta(scheduler_output)
+    def request_finished(
+        self,
+        request: "Request",
+        block_ids: list[int],
+    ) -> tuple[bool, Optional[dict[str, Any]]]:
+        """
+        Called when a request has finished, before its blocks are freed.
+        Returns:
+            True if the request is being saved/sent asynchronously and blocks
+            should not be freed until the request_id is returned from
+            get_finished().
+            Optional KVTransferParams to be included in the request outputs
+            returned by the engine.
+        """
+        return self._lmcache_engine.request_finished(request, block_ids)

vllm/distributed/kv_transfer/kv_connector/v1/metrics.py ADDED Viewed

@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass, field
+from typing import Any, Optional, Union
+from vllm.config.kv_transfer import KVTransferConfig
+from vllm.distributed.kv_transfer.kv_connector.factory import (
+    KVConnectorFactory)
+from vllm.distributed.kv_transfer.kv_transfer_state import (
+    has_kv_transfer_group)
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+@dataclass
+class KVConnectorStats:
+    """
+    Base class for KV Connector Stats, a container for transfer performance
+    metrics or otherwise important telemetry from the connector.
+    All sub-classes need to be serializable as stats are sent from worker to
+    logger process.
+    """
+    data: dict[str, Any] = field(default_factory=dict)
+    def reset(self):
+        """Reset the stats, clear the state."""
+        raise NotImplementedError
+    def aggregate(self, other: "KVConnectorStats") -> "KVConnectorStats":
+        """
+        Aggregate stats with another `KVConnectorStats` object.
+        """
+        raise NotImplementedError
+    def reduce(self) -> dict[str, Union[int, float]]:
+        """
+        Reduce the observations collected during a time interval to one or
+        more representative values (eg avg/median/sum of the series).
+        This is meant to be called by the logger to produce a summary of the
+        stats for the last time interval.
+        """
+        raise NotImplementedError
+    def is_empty(self) -> bool:
+        """Return True if the stats are empty."""
+        raise NotImplementedError
+class KVConnectorLogging:
+    def __init__(self, kv_tranfer_config: KVTransferConfig):
+        # This should be called on frontend process.
+        assert not has_kv_transfer_group()
+        # Instantiate the connector's stats class.
+        if kv_tranfer_config and kv_tranfer_config.kv_connector:
+            self.connector_cls = KVConnectorFactory.get_connector_class(
+                kv_tranfer_config)
+        self.reset()
+    def reset(self):
+        self.transfer_stats_accumulator: Optional[KVConnectorStats] = None
+    def observe(self, transfer_stats_data: dict[str, Any]):
+        # Should not be called when a KVConnector is not configured.
+        assert self.connector_cls is not None
+        # Called periodically when connector syncs with the scheduler.
+        # Note that this is not the same as the logging interval.
+        # We expect transfer_stats_data to be aggregated across all workers and
+        # consist of observations from a single connector or a MultiConnector.
+        transfer_stats = self.connector_cls.build_kv_connector_stats(
+            transfer_stats_data)
+        if transfer_stats is None:
+            logger.warning_once(
+                "The connector %s is collecting stats but "
+                "does not implement the "
+                "`build_kv_connector_stats` method. "
+                "Stats will not be logged.", self.connector_cls)
+            return
+        if self.transfer_stats_accumulator is None:
+            self.transfer_stats_accumulator = transfer_stats
+        else:
+            # Accumulate last interval stats.
+            self.transfer_stats_accumulator = \
+                self.transfer_stats_accumulator.aggregate(transfer_stats)
+    def log(self, log_fn=logger.info):
+        """Log transfer metrics periodically, similar to throughput logging"""
+        if (self.transfer_stats_accumulator
+                and not self.transfer_stats_accumulator.is_empty()):
+            # Produce a single cumulative stats object for the last time
+            # interval from the recorded observations.
+            xfer_metrics = self.transfer_stats_accumulator.reduce()
+            xfer_metrics_str = ", ".join(f"{k}={v}"
+                                         for k, v in xfer_metrics.items())
+            log_fn("KV Transfer metrics: %s", xfer_metrics_str)
+            # Reset metrics for next interval
+            self.reset()

vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py ADDED Viewed

@@ -0,0 +1,328 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+import torch
+from vllm.config import VllmConfig
+from vllm.config.kv_transfer import KVTransferConfig
+from vllm.distributed.kv_transfer.kv_connector.factory import (
+    KVConnectorFactory)
+from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+    KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
+    KVConnectorStats)
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import KVConnectorOutput
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionMetadata
+    from vllm.distributed.kv_events import KVCacheEvent
+    from vllm.forward_context import ForwardContext
+    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.request import Request
+logger = init_logger(__name__)
+@dataclass
+class MultiKVConnectorMetadata(KVConnectorMetadata):
+    metadata: tuple[KVConnectorMetadata, ...]
+    extra_async_saves: Optional[dict[str, int]] = None
+@dataclass
+class MultiKVConnectorStats(KVConnectorStats):
+    """
+    Maintain a dict of KVConnectorStats objects, one for each connector.
+    This is used to aggregate the stats from all connectors separately.
+    """
+    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
+        for connector_id, stats in other.data.items():
+            if connector_id not in self.data:
+                self[connector_id] = stats
+            else:
+                assert isinstance(stats, type(self.data[connector_id]))
+                self[connector_id] = self[connector_id].aggregate(stats)
+        return self
+    def reset(self):
+        for stats in self.data.values():
+            stats.reset()
+    def reduce(self) -> dict[str, Any]:
+        # TODO (NickLucche) Adjust for logging on separate lines
+        return {
+            connector_id: stats.reduce()
+            for connector_id, stats in self.data.items()
+        }
+    def is_empty(self) -> bool:
+        return all(stats.is_empty() for stats in self.data.values())
+    def __getitem__(self, connector_id: str) -> KVConnectorStats:
+        return self.data[connector_id]
+    def __setitem__(self, connector_id: str, stats: KVConnectorStats):
+        self.data[connector_id] = stats
+class MultiConnector(KVConnectorBase_V1):
+    """
+    A wrapper for using multiple KVConnectors at the same time.
+    The current logic is:
+    - Load KV from the first connector that advertises available tokens from
+      get_num_new_matched_tokens(), based on the order in the config.
+    - Save to all connectors.
+    """
+    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
+        super().__init__(vllm_config=vllm_config, role=role)
+        self._connectors: list[KVConnectorBase_V1] = []
+        self._ktc_kv_transfer_config = []
+        ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
+            "connectors")
+        assert ktcs is not None
+        for ktc in ktcs:
+            temp_config = copy.copy(vllm_config)
+            engine_id = ktc.get("engine_id",
+                                vllm_config.kv_transfer_config.engine_id)
+            temp_config.kv_transfer_config = KVTransferConfig(
+                **ktc, engine_id=engine_id)
+            self._connectors.append(
+                KVConnectorFactory.create_connector(temp_config, role))
+            self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config)
+        # A mapping from request id to the index of the connector chosen to
+        # load the request from (if any).
+        self._requests_to_connector: dict[str, int] = {}
+        # Keeps track of *additional* remaining async saves (beyond 1) to be
+        # finished per request. Not needed for async loads since we only allow
+        # a single connector to load.
+        # Propagated from scheduler to worker side via the connector metadata.
+        self._extra_async_saves: dict[str, int] = {}
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        for c in self._connectors:
+            c.register_kv_caches(kv_caches)
+    # We must override the base class method here because we need to bind
+    # the metadata to each connector in the order of the connectors in the
+    # MultiKVConnectorMetadata.
+    def bind_connector_metadata(
+            self, connector_metadata: KVConnectorMetadata) -> None:
+        assert isinstance(connector_metadata, MultiKVConnectorMetadata)
+        if connector_metadata.extra_async_saves:
+            self._extra_async_saves.update(
+                connector_metadata.extra_async_saves)
+        for c, cm in zip(self._connectors, connector_metadata.metadata):
+            c.bind_connector_metadata(cm)
+    def clear_connector_metadata(self) -> None:
+        for c in self._connectors:
+            c.clear_connector_metadata()
+    def shutdown(self):
+        exception: Optional[Exception] = None
+        for c in self._connectors:
+            try:
+                c.shutdown()
+            except Exception as e:
+                logger.exception("Exception during connector %s shutdown.",
+                                 c.__class__.__name__)
+                exception = e
+        if exception:
+            raise exception
+    # ==============================
+    # Worker-side methods
+    # ==============================
+    def start_load_kv(self, forward_context: "ForwardContext",
+                      **kwargs) -> None:
+        for c in self._connectors:
+            c.start_load_kv(forward_context, **kwargs)
+    def wait_for_layer_load(self, layer_name: str) -> None:
+        for c in self._connectors:
+            c.wait_for_layer_load(layer_name)
+    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
+                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+        for c in self._connectors:
+            c.save_kv_layer(layer_name, kv_layer, attn_metadata, **kwargs)
+    def wait_for_save(self):
+        for c in self._connectors:
+            c.wait_for_save()
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+        finished_sending: set[str] = set()
+        finished_recving: set[str] = set()
+        for c in self._connectors:
+            sending, recving = c.get_finished(finished_req_ids)
+            if not recving and not sending:
+                continue
+            # Aggregate finished recving request ids.
+            finished_recving.update(recving or ())
+            # Aggregate finished sending request ids - only include
+            # once we've drained the "extra" count (for cases where
+            # more than one connector is async-saving the same request).
+            for req_id in sending or ():
+                extra_pending = self._extra_async_saves.get(req_id)
+                if extra_pending is None:
+                    finished_sending.add(req_id)
+                    continue
+                assert extra_pending > 0
+                if extra_pending == 1:
+                    del self._extra_async_saves[req_id]
+                else:
+                    self._extra_async_saves[req_id] = extra_pending - 1
+        return finished_sending or None, finished_recving or None
+    # ==============================
+    # Scheduler-side methods
+    # ==============================
+    def get_num_new_matched_tokens(
+        self,
+        request: "Request",
+        num_computed_tokens: int,
+    ) -> tuple[Optional[int], bool]:
+        to_return = (0, False)
+        for i, c in enumerate(self._connectors):
+            toks, load_async = c.get_num_new_matched_tokens(
+                request, num_computed_tokens)
+            # If there is a connector still looking up the matches,
+            # we return None to indicate that we are not done yet.
+            if toks is None:
+                return (None, False)
+            # The first connector that has new matched tokens will be assigned
+            # to this request.
+            if to_return[0] == 0 and toks > 0:
+                self._requests_to_connector[request.request_id] = i
+                to_return = (toks, load_async)
+        return to_return
+    def update_state_after_alloc(self, request: "Request",
+                                 blocks: "KVCacheBlocks",
+                                 num_external_tokens: int):
+        chosen_connector = self._requests_to_connector.get(
+            request.request_id, -1)
+        empty_blocks = blocks.new_empty()
+        for i, c in enumerate(self._connectors):
+            if i == chosen_connector:
+                # Forward call to the chosen connector (if any).
+                c.update_state_after_alloc(request, blocks,
+                                           num_external_tokens)
+            else:
+                # Call with empty blocks for other connectors.
+                c.update_state_after_alloc(request, empty_blocks, 0)
+    def build_connector_meta(
+            self,
+            scheduler_output: SchedulerOutput) -> MultiKVConnectorMetadata:
+        metadata = MultiKVConnectorMetadata(metadata=tuple(
+            c.build_connector_meta(scheduler_output)
+            for c in self._connectors))
+        if self._extra_async_saves:
+            metadata.extra_async_saves = self._extra_async_saves
+            self._extra_async_saves = {}
+        return metadata
+    def update_connector_output(self, connector_output: KVConnectorOutput):
+        for c in self._connectors:
+            c.update_connector_output(connector_output)
+    def request_finished(
+        self,
+        request: "Request",
+        blocks: list[int],
+    ) -> tuple[bool, Optional[dict[str, Any]]]:
+        async_saves = 0
+        kv_txfer_params = None
+        for c in self._connectors:
+            async_save, txfer_params = c.request_finished(request, blocks)
+            if async_save:
+                async_saves += 1
+            if txfer_params is not None:
+                if kv_txfer_params is not None:
+                    # TODO we can probably change this to merge the dicts here,
+                    # checking for key clashes.
+                    raise RuntimeError(
+                        "Only one connector can produce KV transfer params")
+                kv_txfer_params = txfer_params
+        if async_saves > 1:
+            self._extra_async_saves[request.request_id] = async_saves - 1
+        # Clean up other state for this request.
+        self._requests_to_connector.pop(request.request_id, None)
+        return async_saves > 0, kv_txfer_params
+    def take_events(self) -> Iterable["KVCacheEvent"]:
+        for c in self._connectors:
+            yield from c.take_events()
+    @classmethod
+    def get_required_kvcache_layout(
+            cls, vllm_config: "VllmConfig") -> Optional[str]:
+        """
+        Get the required KV cache layout for this connector.
+        Args:
+            vllm_config (VllmConfig): the vllm config.
+        Returns:
+            str: the required KV cache layout. e.g. HND, or NHD.
+            None if the connector does not require a specific layout.
+        """
+        ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
+            "connectors")
+        assert ktcs is not None
+        layouts: set[str] = set()
+        temp_vllm_config = copy.copy(vllm_config)
+        for ktc in ktcs:
+            kv_transfer_config = KVTransferConfig(**ktc)
+            temp_vllm_config.kv_transfer_config = kv_transfer_config
+            connector_cls = KVConnectorFactory.get_connector_class(
+                kv_transfer_config)
+            required_kvcache_layout = (
+                connector_cls.get_required_kvcache_layout(temp_vllm_config))
+            if required_kvcache_layout is not None:
+                layouts.add(required_kvcache_layout)
+        if len(layouts) > 1:
+            raise ValueError(f"KV cache layout mismatch: "
+                             f"found {len(layouts)} different layouts "
+                             f"({', '.join(layouts) })."
+                             f"All connectors must use the same layout.")
+        return next(iter(layouts), None)
+    @classmethod
+    def build_kv_connector_stats(
+            cls,
+            data: Optional[dict[str,
+                                Any]] = None) -> Optional[KVConnectorStats]:
+        return MultiKVConnectorStats(data=data) if data is not None \
+            else MultiKVConnectorStats()
+    def get_kv_connector_stats(self) -> Optional[MultiKVConnectorStats]:
+        # Group connector stats by connector type.
+        stats_by_connector: Optional[MultiKVConnectorStats] = None
+        for c in self._connectors:
+            stats = c.get_kv_connector_stats()
+            if stats is None:
+                continue
+            if stats_by_connector is None:
+                # Lazy init to allow optional return value.
+                stats_by_connector = MultiKVConnectorStats()
+            stats_by_connector[c.__class__.__name__] = stats
+        return stats_by_connector