PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/multimodal/profiling.py ADDED Viewed

@@ -0,0 +1,274 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from typing import Generic, NamedTuple, Optional, TypeVar, cast
+import numpy as np
+import numpy.typing as npt
+from PIL import Image
+import vllm.envs as envs
+from vllm.logger import init_logger
+from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
+                     MultiModalInputs, MultiModalKwargs,
+                     MultiModalPlaceholderDict)
+from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
+                         EncDecMultiModalProcessor)
+# Module-level logger, named after this module.
+logger = init_logger(__name__)
+@dataclass
+class ProcessorInputs:
+    """
+    Represents the keyword arguments to
+    :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
+    """
+    # Prompt text; the profiler in this module passes it to ``apply`` as
+    # ``prompt``.
+    prompt_text: str
+    # Multi-modal data items; passed to ``apply`` as ``mm_data``.
+    mm_data: MultiModalDataDict
+    # Extra keyword arguments passed to ``apply`` under the same name.
+    # ``default_factory=dict`` gives every instance its own empty dict.
+    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
+class DummyEncoderData(NamedTuple):
+    """Dummy data used for profiling."""
+    # Encoder prompt token IDs, as produced by
+    # ``MultiModalProfiler.get_encoder_dummy_data`` in this module.
+    prompt_token_ids: list[int]
+class DummyDecoderData(NamedTuple):
+    """Dummy data used for profiling."""
+    # Decoder prompt token IDs; ``MultiModalProfiler.get_decoder_dummy_data``
+    # pads them with ``0`` when they are shorter than the requested length.
+    prompt_token_ids: list[int]
+    # The ``"mm_kwargs"`` entry of the processed multi-modal inputs.
+    multi_modal_data: MultiModalKwargs
+    # The ``"mm_placeholders"`` entry of the processed multi-modal inputs.
+    multi_modal_placeholders: MultiModalPlaceholderDict
+# Type variable for the processing-info type that a dummy inputs builder or
+# profiler is parameterized by; restricted to ``BaseProcessingInfo`` subtypes.
+_I = TypeVar("_I", bound=BaseProcessingInfo)
+class BaseDummyInputsBuilder(ABC, Generic[_I]):
+    """
+    Abstract base class that constructs the dummy data to profile
+    multi-modal models.
+    """
+    def __init__(self, info: _I) -> None:
+        super().__init__()
+        self.info = info
+    # TODO: @abstractmethod after transition
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        """
+        Build the text input corresponding to :code:`mm_counts`.
+        """
+        if (type(self).get_dummy_processor_inputs ==
+                BaseDummyInputsBuilder.get_dummy_processor_inputs):
+            raise NotImplementedError
+        logger.warning_once("`get_dummy_processor_inputs` has been split up "
+                            "into `get_dummy_text` and `get_dummy_mm_data`. "
+                            "These two methods will be marked as abstract "
+                            "in an upcoming release.")
+        seq_len = self.info.ctx.model_config.max_model_len
+        return self.get_dummy_processor_inputs(seq_len, mm_counts).prompt_text
+    # TODO: @abstractmethod after transition
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        """
+        Build the multimodal input which, after processing, results in
+        the maximum possible number of placeholder tokens.
+        """
+        raise NotImplementedError
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        """
+        Build the input which, after processing, results in
+        the maximum possible number of placeholder tokens.
+        """
+        dummy_text = self.get_dummy_text(mm_counts)
+        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
+        return ProcessorInputs(prompt_text=dummy_text, mm_data=dummy_mm_data)
+    def _get_dummy_audios(
+        self,
+        *,
+        length: int,
+        num_audios: int,
+    ) -> list[npt.NDArray]:
+        if num_audios == 0:
+            return []
+        audio = np.zeros((length, ))
+        return [audio] * num_audios
+    def _get_dummy_images(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_images: int,
+    ) -> list[Image.Image]:
+        if num_images == 0:
+            return []
+        image = Image.new("RGB", (width, height), color=255)
+        return [image] * num_images
+    def _get_dummy_videos(
+        self,
+        *,
+        width: int,
+        height: int,
+        num_frames: int,
+        num_videos: int,
+    ) -> list[npt.NDArray]:
+        if num_videos == 0:
+            return []
+        video = np.full((num_frames, width, height, 3), 255)
+        return [video] * num_videos
+class MultiModalProfiler(Generic[_I]):
+    """
+    Contains code for running memory profiling for multi-modal models.
+    """
+    def __init__(
+        self,
+        processor: BaseMultiModalProcessor[_I],
+    ) -> None:
+        super().__init__()
+        self.processor = processor
+    @property
+    def processing_info(self) -> BaseProcessingInfo:
+        return self.processor.info
+    @property
+    def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]:
+        return self.processor.dummy_inputs
+    def get_mm_limits(self) -> Mapping[str, int]:
+        return self.processing_info.get_allowed_mm_limits()
+    def _get_dummy_mm_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Optional[Mapping[str, int]] = None,
+    ) -> MultiModalInputs:
+        if mm_counts is None:
+            mm_counts = self.get_mm_limits()
+        factory = self.dummy_inputs
+        processor_inputs = factory.get_dummy_processor_inputs(
+            seq_len, mm_counts)
+        return self.processor.apply(
+            prompt=processor_inputs.prompt_text,
+            mm_data=processor_inputs.mm_data,
+            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
+        )
+    def _get_mm_num_tokens(
+        self,
+        mm_inputs: MultiModalInputs,
+    ) -> Mapping[str, int]:
+        placeholders_by_modality = mm_inputs["mm_placeholders"]
+        return {
+            modality: sum(item.get_num_embeds() for item in placeholders)
+            for modality, placeholders in placeholders_by_modality.items()
+        }
+    def get_encoder_dummy_data(
+        self,
+        seq_len: int,
+        mm_counts: Optional[Mapping[str, int]] = None,
+    ) -> DummyEncoderData:
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        mm_inputs = cast(MultiModalEncDecInputs, mm_inputs)
+        # For encoder-decoder models, use encoder prompt token ids instead of
+        # decoder prompt to construct dummy seq_data for encoder profiling.
+        encoder_prompt_token_ids = mm_inputs["encoder_prompt_token_ids"]
+        total_len = len(encoder_prompt_token_ids)
+        processor = cast(EncDecMultiModalProcessor, self.processor)
+        if processor.pad_dummy_encoder_prompt:
+            num_tokens_to_pad = max(total_len, seq_len) - total_len
+            encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
+        # NOTE: Whisper allows total_len > seq_len.
+        elif total_len > seq_len and not envs.VLLM_USE_V1:
+            # `max_num_batched_tokens` is defined by `SchedulerConfig`
+            logger.warning_once(
+                "The encoder sequence length used for profiling ("
+                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
+                " is too short "
+                "to hold the multi-modal embeddings in the worst case "
+                f"({total_len} tokens in total, out of which "
+                f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
+                "multi-modal embeddings). This may cause certain "
+                "multi-modal inputs to fail during inference, even when "
+                "the input text is short. To avoid this, you should "
+                "increase `max_model_len`, reduce `max_num_seqs`, "
+                "and/or reduce `mm_counts`.")
+        return DummyEncoderData(encoder_prompt_token_ids)
+    def get_decoder_dummy_data(
+        self,
+        seq_len: int,
+        mm_counts: Optional[Mapping[str, int]] = None,
+    ) -> DummyDecoderData:
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        prompt_token_ids = mm_inputs["prompt_token_ids"]
+        total_len = len(prompt_token_ids)
+        # V0 does not support chunked prefill.
+        if total_len > seq_len and not envs.VLLM_USE_V1:
+            # `max_num_batched_tokens` is defined by `SchedulerConfig`
+            logger.warning_once(
+                "The sequence length used for profiling ("
+                f"max_num_batched_tokens / max_num_seqs = {seq_len}) "
+                "is too short "
+                "to hold the multi-modal embeddings in the worst case "
+                f"({total_len} tokens in total, out of which "
+                f"{self._get_mm_num_tokens(mm_inputs)} are reserved for "
+                "multi-modal embeddings). This may cause certain "
+                "multi-modal inputs to fail during inference, even when "
+                "the input text is short. To avoid this, you should "
+                "increase `max_model_len`, reduce `max_num_seqs`, "
+                "and/or reduce `mm_counts`.")
+        if total_len < seq_len:
+            prompt_token_ids.extend([0] * (seq_len - total_len))
+        return DummyDecoderData(
+            prompt_token_ids=prompt_token_ids,
+            multi_modal_data=mm_inputs["mm_kwargs"],
+            multi_modal_placeholders=mm_inputs["mm_placeholders"],
+        )
+    def get_mm_max_tokens(
+        self,
+        seq_len: int,
+        mm_counts: Optional[Mapping[str, int]] = None,
+    ) -> Mapping[str, int]:
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        return self._get_mm_num_tokens(mm_inputs)

vllm/multimodal/registry.py ADDED Viewed

@@ -0,0 +1,321 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar
+import torch.nn as nn
+from typing_extensions import deprecated
+from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
+from vllm.inputs import InputProcessingContext
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import (AnyTokenizer,
+                                               cached_tokenizer_from_config)
+from vllm.utils import ClassRegistry
+from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
+                         ProcessingCache)
+from .profiling import (BaseDummyInputsBuilder, DummyDecoderData,
+                        DummyEncoderData, MultiModalProfiler)
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+# Module-level logger, named after this module.
+logger = init_logger(__name__)
+# Type variable for a model class (a subclass of ``nn.Module``); lets the
+# ``register_processor`` decorator return the class it was given.
+N = TypeVar("N", bound=type[nn.Module])
+# Type variable for a processing-info type (``BaseProcessingInfo`` subtype).
+_I = TypeVar("_I", bound=BaseProcessingInfo)
+# Covariant variant of ``_I``, used by ``ProcessingInfoFactory`` where the
+# type only appears as a return type.
+_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
+class ProcessingInfoFactory(Protocol[_I_co]):
+    """
+    Constructs a :class:`BaseProcessingInfo` instance from the context.
+    """
+    def __call__(
+        self,
+        ctx: InputProcessingContext,
+    ) -> _I_co:
+        ...
+class DummyInputsBuilderFactory(Protocol[_I]):
+    """
+    Constructs a :class:`BaseDummyInputsBuilder` instance from the
+    processing info.
+    """
+    def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
+        ...
+class MultiModalProcessorFactory(Protocol[_I]):
+    """
+    Constructs a :class:`BaseMultiModalProcessor` instance from the
+    processing info and the dummy inputs builder, with an optional
+    processing cache.
+    """
+    def __call__(
+        self,
+        info: _I,
+        dummy_inputs: BaseDummyInputsBuilder[_I],
+        *,
+        cache: Optional[ProcessingCache] = None,
+    ) -> BaseMultiModalProcessor[_I]:
+        ...
+@dataclass(frozen=True)
+class _ProcessorFactories(Generic[_I]):
+    info: ProcessingInfoFactory[_I]
+    processor: MultiModalProcessorFactory[_I]
+    dummy_inputs: DummyInputsBuilderFactory[_I]
+    def build_processor(
+        self,
+        ctx: InputProcessingContext,
+        *,
+        cache: Optional[ProcessingCache] = None,
+    ):
+        info = self.info(ctx)
+        dummy_inputs_builder = self.dummy_inputs(info)
+        return self.processor(info, dummy_inputs_builder, cache=cache)
+class MultiModalRegistry:
+    """
+    A registry that dispatches data processing according to the model.
+    """
+    def __init__(self) -> None:
+        self._processor_factories = ClassRegistry[nn.Module,
+                                                  _ProcessorFactories]()
+        self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_GIB)
+    @deprecated("Legacy input processor/mapper pipeline has been removed. "
+                "Please update your model runner to use "
+                "`seq_group_metadata.multi_modal_data` directly without "
+                "further processing.")
+    def create_input_mapper(self, model_config: "ModelConfig"):
+        return lambda data, mm_processor_kwargs: data
+    def get_max_tokens_per_item_by_modality(
+        self,
+        model_config: "ModelConfig",
+    ) -> Mapping[str, int]:
+        """
+        Get the maximum number of tokens per data item from each modality based
+        on underlying model configuration.
+        """
+        if not model_config.is_multimodal_model:
+            return {}
+        processor = self.create_processor(model_config, disable_cache=True)
+        profiler = MultiModalProfiler(processor)
+        seq_len = model_config.max_model_len
+        mm_limits = self.get_mm_limits_per_prompt(model_config)
+        return profiler.get_mm_max_tokens(
+            seq_len,
+            {
+                modality: 1
+                for modality, limit in mm_limits.items() if limit > 0
+            },
+        )
+    def get_max_tokens_per_item_by_nonzero_modality(
+        self,
+        model_config: "ModelConfig",
+    ) -> Mapping[str, int]:
+        """
+        Get the maximum number of tokens per data item from each modality based
+        on underlying model configuration, excluding modalities that user
+        explicitly disabled via `limit_mm_per_prompt`.
+        Note:
+            This is currently directly used only in V1 for profiling the memory
+            usage of a model.
+        """
+        mm_limits = self.get_mm_limits_per_prompt(model_config)
+        return {
+            key: max_tokens_per_mm_item
+            for key, max_tokens_per_mm_item in
+            self.get_max_tokens_per_item_by_modality(model_config).items()
+            if mm_limits[key] > 0
+        }
+    def get_max_tokens_by_modality(
+        self,
+        model_config: "ModelConfig",
+    ) -> Mapping[str, int]:
+        """
+        Get the maximum number of tokens from each modality
+        for profiling the memory usage of a model.
+        See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
+        """
+        mm_limits = self.get_mm_limits_per_prompt(model_config)
+        return {
+            key: mm_limits[key] * max_tokens_per_mm_item
+            for key, max_tokens_per_mm_item in
+            self.get_max_tokens_per_item_by_modality(model_config).items()
+        }
+    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
+        """
+        Get the maximum number of multi-modal tokens
+        for profiling the memory usage of a model.
+        See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
+        """
+        return sum(self.get_max_tokens_by_modality(model_config).values())
+    @deprecated("Legacy input processor/mapper pipeline has been removed. "
+                "Please update your model runner to use "
+                "`seq_group_metadata.multi_modal_data` directly without "
+                "further processing.")
+    def init_mm_limits_per_prompt(
+        self,
+        model_config: "ModelConfig",
+    ) -> None:
+        pass
+    def get_mm_limits_per_prompt(
+        self,
+        model_config: "ModelConfig",
+    ) -> Mapping[str, int]:
+        """
+        Get the maximum number of multi-modal input instances for each modality
+        that are allowed per prompt for a model class.
+        """
+        if not model_config.is_multimodal_model:
+            return {}
+        processor = self.create_processor(model_config, disable_cache=True)
+        profiler = MultiModalProfiler(processor)
+        return profiler.get_mm_limits()
+    def register_processor(
+        self,
+        processor: MultiModalProcessorFactory[_I],
+        *,
+        info: ProcessingInfoFactory[_I],
+        dummy_inputs: DummyInputsBuilderFactory[_I],
+    ):
+        """
+        Register a multi-modal processor to a model class. The processor
+        is constructed lazily, hence a factory method should be passed.
+        When the model receives multi-modal data, the provided function is
+        invoked to transform the data into a dictionary of model inputs.
+        See also:
+            :ref:`mm-processing`
+        """
+        def wrapper(model_cls: N) -> N:
+            if self._processor_factories.contains(model_cls, strict=True):
+                logger.warning(
+                    "Model class %s already has a multi-modal processor "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+            self._processor_factories[model_cls] = _ProcessorFactories(
+                info=info,
+                dummy_inputs=dummy_inputs,
+                processor=processor,
+            )
+            return model_cls
+        return wrapper
+    def _get_model_cls(self, model_config: "ModelConfig"):
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+        model_cls, _ = get_model_architecture(model_config)
+        return model_cls
+    @deprecated("Legacy input processor/mapper pipeline has been removed. "
+                "Please update your model runner to use "
+                "`seq_group_metadata.multi_modal_data` directly without "
+                "further processing.")
+    def has_processor(self, model_config: "ModelConfig") -> bool:
+        return True
+    def create_processor(
+        self,
+        model_config: "ModelConfig",
+        *,
+        tokenizer: Optional[AnyTokenizer] = None,
+        disable_cache: Optional[bool] = None,
+    ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
+        """
+        Create a multi-modal processor for a specific model and tokenizer.
+        See also:
+            :ref:`mm-processing`
+        """
+        if not model_config.is_multimodal_model:
+            raise ValueError(f"{model_config.model} is not a multimodal model")
+        if tokenizer is None:
+            tokenizer = cached_tokenizer_from_config(model_config)
+        if disable_cache is None:
+            disable_cache = model_config.disable_mm_preprocessor_cache
+        model_cls = self._get_model_cls(model_config)
+        factories = self._processor_factories[model_cls]
+        ctx = InputProcessingContext(model_config, tokenizer)
+        cache = None if disable_cache else self._processing_cache
+        return factories.build_processor(ctx, cache=cache)
+    def get_decoder_dummy_data(
+        self,
+        model_config: "ModelConfig",
+        seq_len: int,
+        mm_counts: Optional[Mapping[str, int]] = None,
+    ) -> DummyDecoderData:
+        """
+        Create dummy data for profiling the memory usage of a model.
+        The model is identified by ``model_config``.
+        """
+        processor = self.create_processor(model_config, disable_cache=True)
+        profiler = MultiModalProfiler(processor)
+        dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts)
+        # Having more tokens is over-conservative but otherwise fine
+        token_ids = dummy_data.prompt_token_ids
+        if len(token_ids) < seq_len:
+            raise AssertionError(
+                f"Expected at least {seq_len} dummy tokens for profiling, "
+                f"but found {len(token_ids)} tokens instead.")
+        return dummy_data
+    def get_encoder_dummy_data(
+        self,
+        model_config: "ModelConfig",
+        seq_len: int,
+        mm_counts: Optional[Mapping[str, int]] = None,
+    ) -> DummyEncoderData:
+        """
+        Create dummy data for profiling the memory usage of a model.
+        The model is identified by ``model_config``.
+        """
+        processor = self.create_processor(model_config, disable_cache=True)
+        profiler = MultiModalProfiler(processor)
+        dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts)
+        # Having more tokens is over-conservative but otherwise fine
+        token_ids = dummy_data.prompt_token_ids
+        if len(token_ids) < seq_len:
+            logger.warning_once(
+                f"Expected at least {seq_len} dummy encoder tokens for "
+                f"profiling, but found {len(token_ids)} tokens instead.")
+        return dummy_data