PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/prompt_adapter/worker_manager.py ADDED Viewed

@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Any, Optional, Set, Type
+import torch
+from vllm.adapter_commons.utils import (add_adapter_worker,
+                                        apply_adapters_worker,
+                                        list_adapters_worker,
+                                        set_active_adapters_worker)
+from vllm.adapter_commons.worker_manager import AbstractWorkerManager
+from vllm.config import PromptAdapterConfig
+from vllm.prompt_adapter.models import (LRUCachePromptAdapterModelManager,
+                                        PromptAdapterModel,
+                                        PromptAdapterModelManager,
+                                        create_prompt_adapter_manager)
+from vllm.prompt_adapter.request import PromptAdapterRequest
+logger = logging.getLogger(__name__)
+class WorkerPromptAdapterManager(AbstractWorkerManager):
+    """WorkerPromptAdapterManager that manages
+    prompt_adapter models on the worker side.
+    Every request, the requested prompt_adapters will be
+    loaded (unless they are already loaded),
+    and every other prompt_adapter will be unloaded."""
+    _manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager
+    def __init__(
+        self,
+        max_num_seqs: int,
+        max_num_batched_tokens: int,
+        device: torch.device,
+        prompt_adapter_config: PromptAdapterConfig,
+        prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel
+    ):
+        self._adapter_manager: PromptAdapterModelManager
+        self.max_num_seqs = max_num_seqs
+        self.max_num_batched_tokens = max_num_batched_tokens
+        self._prompt_adapter_model_cls = prompt_adapter_model_cls
+        self.prompt_adapter_config = prompt_adapter_config
+        super().__init__(device)
+    @property
+    def is_enabled(self) -> bool:
+        return True
+    def create_prompt_adapter_manager(
+        self,
+        model: torch.nn.Module,
+    ) -> Any:
+        prompt_adapter_manager = create_prompt_adapter_manager(
+            model,
+            max_num_seqs=self.max_num_seqs,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+            prompt_adapter_config=self.prompt_adapter_config,
+            prompt_adapter_manager_cls=self._manager_cls,
+        )
+        self._adapter_manager = prompt_adapter_manager
+        return prompt_adapter_manager.model
+    def _load_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest
+    ) -> PromptAdapterModel:
+        try:
+            prompt_adapter = (
+                self._prompt_adapter_model_cls.from_local_checkpoint(
+                    prompt_adapter_request.prompt_adapter_local_path,
+                    prompt_adapter_id=prompt_adapter_request.prompt_adapter_id,
+                    num_virtual_tokens=prompt_adapter_request.
+                    prompt_adapter_num_virtual_tokens,
+                    config=self.prompt_adapter_config,
+                    device=str(self.device),
+                ))
+        except Exception as e:
+            raise RuntimeError(
+                f"Loading prompt_adapter "
+                f"{prompt_adapter_request.prompt_adapter_local_path}"
+                f" failed") from e
+        return prompt_adapter
+    def add_dummy_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        return True
+    def pin_adapter(self, adapter_id: int) -> bool:
+        return self._adapter_manager.pin_adapter(adapter_id)
+    def set_active_adapters(self, requests: Set[Any],
+                            mapping: Optional[Any]) -> None:
+        set_active_adapters_worker(requests, mapping, self._apply_adapters,
+                                   self._adapter_manager.set_adapter_mapping)
+    def add_adapter(self, adapter_request: Any) -> bool:
+        return add_adapter_worker(adapter_request, self.list_adapters,
+                                  self._load_adapter,
+                                  self._adapter_manager.add_adapter,
+                                  self._adapter_manager.activate_adapter)
+    def _apply_adapters(self, adapter_requests: Set[Any]) -> None:
+        apply_adapters_worker(adapter_requests, self.list_adapters,
+                              self._adapter_manager.adapter_slots,
+                              self.remove_adapter, self.add_adapter)
+    def remove_adapter(self, adapter_id: int) -> bool:
+        return self._adapter_manager.remove_adapter(adapter_id)
+    def remove_all_adapters(self):
+        self._adapter_manager.remove_all_adapters()
+    def list_adapters(self) -> Set[int]:
+        return list_adapters_worker(self._adapter_manager.list_adapters)
+class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager):
+    """WorkerPromptAdapterManager that manages
+    prompt_adapter models on the worker side.
+    Uses an LRU Cache. Every request, the requested
+    prompt_adapters will be loaded (unless they are already loaded)
+    and least recently used prompt_adapters will
+    be unloaded if the cache is above capacity."""
+    _prompt_adapter_manager_cls: Type[
+        LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager
+    def create_prompt_adapter_manager(
+        self,
+        model: torch.nn.Module,
+    ) -> Any:
+        prompt_adapter_manager = create_prompt_adapter_manager(
+            model,
+            max_num_seqs=self.max_num_seqs,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+            prompt_adapter_config=self.prompt_adapter_config,
+            prompt_adapter_manager_cls=self._prompt_adapter_manager_cls)
+        self._adapter_manager: LRUCachePromptAdapterModelManager = (
+            prompt_adapter_manager)
+        return prompt_adapter_manager.model
+    def _apply_adapters(
+            self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None:
+        prompt_adapters_map = {
+            prompt_adapter_request.prompt_adapter_id: prompt_adapter_request
+            for prompt_adapter_request in prompt_adapter_requests
+            if prompt_adapter_request
+        }
+        if len(prompt_adapters_map
+               ) > self._adapter_manager.prompt_adapter_slots:
+            raise RuntimeError(
+                f"Number of requested prompt_adapters "
+                f"({len(prompt_adapters_map)}) is greater "
+                "than the number of GPU prompt_adapter slots "
+                f"({self._adapter_manager.prompt_adapter_slots}).")
+        for prompt_adapter in prompt_adapters_map.values():
+            self.add_adapter(prompt_adapter)
+    def add_adapter(self,
+                    prompt_adapter_request: PromptAdapterRequest) -> bool:
+        if prompt_adapter_request.prompt_adapter_id not in self.list_adapters(
+        ):
+            # Remove before we load the new prompt_adapter to save memory
+            if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
+                self._adapter_manager.remove_oldest_adapter()
+            prompt_adapter = self._load_adapter(prompt_adapter_request)
+            loaded = self._adapter_manager.add_adapter(prompt_adapter)
+        else:
+            # If the prompt_adapter is already loaded, just touch it to
+            # update its position in the caches
+            loaded = self._adapter_manager.get_adapter(
+                prompt_adapter_request.prompt_adapter_id) is not None
+        self._adapter_manager.activate_adapter(
+            prompt_adapter_request.prompt_adapter_id)
+        return loaded

vllm/py.typed ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # Marker file for PEP 561.
2	+ # The vllm package uses inline types.

vllm/reasoning/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
+from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from .granite_reasoning_parser import GraniteReasoningParser
+__all__ = [
+    "ReasoningParser",
+    "ReasoningParserManager",
+    "DeepSeekR1ReasoningParser",
+    "GraniteReasoningParser",
+]

vllm/reasoning/abs_reasoning_parsers.py ADDED Viewed

@@ -0,0 +1,189 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+from abc import abstractmethod
+from collections.abc import Sequence
+from functools import cached_property
+from typing import Callable, Optional, Union
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import import_from_path, is_list_of
+logger = init_logger(__name__)
+class ReasoningParser:
+    """
+    Abstract reasoning parser class that should not be used directly.
+    Provided and methods should be used in derived classes.
+    It is used to extract reasoning content from the model output.
+    """
+    def __init__(self, tokenizer: AnyTokenizer):
+        self.model_tokenizer = tokenizer
+    @cached_property
+    def vocab(self) -> dict[str, int]:
+        # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
+        # whereas all tokenizers have .get_vocab()
+        return self.model_tokenizer.get_vocab()
+    @abstractmethod
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        """
+        Check if the reasoning content ends in the input_ids.
+        It is used in structured engines like `xgrammar` to check if the
+        reasoning content ends in the model output.
+        Parameters:
+        input_ids: list[int]
+            The input_ids of the model output.
+        Returns:
+        bool
+            True if the reasoning content ends in the input_ids.
+        """
+    @abstractmethod
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract content token ids from the input_ids.
+        Parameters:
+        input_ids: list[int]
+            The input_ids of the model output.
+        Returns:
+        list[int]
+            The extracted content from the input_ids.
+        """
+    @abstractmethod
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[Optional[str], Optional[str]]:
+        """
+        Extract reasoning content from a complete model-generated string.
+        Used for non-streaming responses where we have the entire model response
+        available before sending to the client.
+        Parameters:
+        model_output: str
+            The model-generated string to extract reasoning content from.
+        request: ChatCompletionRequest
+            The request object that was used to generate the model_output.
+        Returns:
+        tuple[Optional[str], Optional[str]]
+            A tuple containing the reasoning content and the content.
+        """
+    @abstractmethod
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Instance method that should be implemented for extracting reasoning
+        from an incomplete response; for use when handling reasoning calls and
+        streaming. Has to be an instance method because  it requires state -
+        the current tokens/diffs, but also the information about what has
+        previously been parsed and extracted (see constructor)
+        """
+class ReasoningParserManager:
+    reasoning_parsers: dict[str, type] = {}
+    @classmethod
+    def get_reasoning_parser(cls, name) -> type:
+        """
+        Get reasoning parser by name which is registered by `register_module`.
+        Raise a KeyError exception if the name is not registered.
+        """
+        if name in cls.reasoning_parsers:
+            return cls.reasoning_parsers[name]
+        raise KeyError(
+            f"reasoning helper: '{name}' not found in reasoning_parsers")
+    @classmethod
+    def _register_module(
+        cls,
+        module: type,
+        module_name: Optional[Union[str, list[str]]] = None,
+        force: bool = True,
+    ) -> None:
+        if not issubclass(module, ReasoningParser):
+            raise TypeError("module must be subclass of ReasoningParser, "
+                            f"but got {type(module)}")
+        if module_name is None:
+            module_name = module.__name__
+        if isinstance(module_name, str):
+            module_name = [module_name]
+        for name in module_name:
+            if not force and name in cls.reasoning_parsers:
+                existed_module = cls.reasoning_parsers[name]
+                raise KeyError(f"{name} is already registered "
+                               f"at {existed_module.__module__}")
+            cls.reasoning_parsers[name] = module
+    @classmethod
+    def register_module(
+        cls,
+        name: Optional[Union[str, list[str]]] = None,
+        force: bool = True,
+        module: Union[type, None] = None,
+    ) -> Union[type, Callable]:
+        """
+        Register module with the given name or name list. it can be used as a
+        decoder(with module as None) or normal function(with module as not
+        None).
+        """
+        if not isinstance(force, bool):
+            raise TypeError(f"force must be a boolean, but got {type(force)}")
+        # raise the error ahead of time
+        if not (name is None or isinstance(name, str)
+                or is_list_of(name, str)):
+            raise TypeError(
+                "name must be None, an instance of str, or a sequence of str, "
+                f"but got {type(name)}")
+        # use it as a normal method: x.register_module(module=SomeClass)
+        if module is not None:
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+        # use it as a decorator: @x.register_module()
+        def _register(module):
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+        return _register
+    @classmethod
+    def import_reasoning_parser(cls, plugin_path: str) -> None:
+        """
+        Import a user-defined reasoning parser by the path
+        of the reasoning parser define file.
+        """
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+        try:
+            import_from_path(module_name, plugin_path)
+        except Exception:
+            logger.exception("Failed to load module '%s' from %s.",
+                             module_name, plugin_path)
+            return

vllm/reasoning/deepseek_r1_reasoning_parser.py ADDED Viewed

@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections.abc import Sequence
+from typing import Optional, Union
+from transformers import PreTrainedTokenizerBase
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+logger = init_logger(__name__)
+@ReasoningParserManager.register_module("deepseek_r1")
+class DeepSeekR1ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for DeepSeek R1 model.
+    The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning
+    text. This parser extracts the reasoning content from the model output.
+    """
+    start_token_id: int
+    end_token_id: int
+    start_token: str = "<think>"
+    end_token: str = "</think>"
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction.")
+        self.start_token_id = self.vocab.get(self.start_token)
+        self.end_token_id = self.vocab.get(self.end_token)
+        if self.start_token_id is None or self.end_token_id is None:
+            raise RuntimeError(
+                "DeepSeek R1 reasoning parser could not locate think start/end "
+                "tokens in the tokenizer!")
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        return self.end_token_id in input_ids
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract the content after the end tokens
+        """
+        if self.end_token_id not in input_ids[:-1]:
+            return []
+        else:
+            return input_ids[input_ids.index(self.end_token_id) + 1:]
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract reasoning content from a delta message.
+        Handles streaming output where previous + delta = current.
+        Uses token IDs for faster processing.
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+        """
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
+                self.start_token_id, self.end_token_id
+        ]):
+            return None
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
+        if self.start_token_id in previous_token_ids:
+            if self.end_token_id in delta_token_ids:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token):]
+                return DeltaMessage(
+                    reasoning_content=reasoning_content,
+                    content=content if content else None,
+                )
+            elif self.end_token_id in previous_token_ids:
+                # <think> in previous, </think> in previous,
+                # reasoning content continues
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.start_token_id in delta_token_ids:
+            if self.end_token_id in delta_token_ids:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.start_token)
+                end_index = delta_text.find(self.end_token)
+                reasoning_content = delta_text[start_index +
+                                               len(self.start_token):end_index]
+                content = delta_text[end_index + len(self.end_token):]
+                return DeltaMessage(
+                    reasoning_content=reasoning_content,
+                    content=content if content else None,
+                )
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.end_token_id in delta_token_ids:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_index = delta_text.find(self.end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token):]
+                return DeltaMessage(
+                    reasoning_content=reasoning_content,
+                    content=content if content else None,
+                )
+            elif self.end_token_id in previous_token_ids:
+                # </think> in previous, thinking content ends
+                return DeltaMessage(content=delta_text)
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[Optional[str], Optional[str]]:
+        """
+        Extract reasoning content from the model output.
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+        Returns:
+            tuple[Optional[str], Optional[str]]: reasoning content and content
+        """
+        # Check if the start token is present in the model output, remove it
+        # if it is present.
+        model_output_parts = model_output.partition(self.start_token)
+        model_output = model_output_parts[2] if model_output_parts[
+            1] else model_output_parts[0]
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.end_token not in model_output:
+            return model_output, None
+        else:
+            reasoning_content, _, content = model_output.partition(
+                self.end_token)
+            # If the end token is not found, return the model output as is.
+            # It should not happen since we already checked for the presence
+            # of the end token.
+            # If generation stops right after end-of-think, return null content
+            final_content = content or None
+            return reasoning_content, final_content