PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/multimodal/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .base import MultiModalPlaceholderMap
+from .hasher import MultiModalHashDict, MultiModalHasher
+from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
+                     MultiModalDataDict, MultiModalKwargs,
+                     MultiModalPlaceholderDict, NestedTensors)
+from .registry import MultiModalRegistry
+MULTIMODAL_REGISTRY = MultiModalRegistry()
+"""
+The global [`MultiModalRegistry`][vllm.multimodal.registry.MultiModalRegistry]
+is used by model runners to dispatch data processing according to the target
+model.
+Info:
+    [mm_processing](../../../design/mm_processing.html)
+"""
+__all__ = [
+    "BatchedTensorInputs",
+    "ModalityData",
+    "MultiModalDataBuiltins",
+    "MultiModalDataDict",
+    "MultiModalHashDict",
+    "MultiModalHasher",
+    "MultiModalKwargs",
+    "MultiModalPlaceholderDict",
+    "MultiModalPlaceholderMap",
+    "NestedTensors",
+    "MULTIMODAL_REGISTRY",
+    "MultiModalRegistry",
+]

vllm/multimodal/audio.py ADDED Viewed

@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import base64
+from io import BytesIO
+from pathlib import Path
+from typing import Literal, Optional
+import numpy as np
+import numpy.typing as npt
+from vllm.utils import PlaceholderModule
+from .base import MediaIO
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+try:
+    import soundfile
+except ImportError:
+    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
+def resample_audio_librosa(
+    audio: npt.NDArray[np.floating],
+    *,
+    orig_sr: float,
+    target_sr: float,
+) -> npt.NDArray[np.floating]:
+    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+def resample_audio_scipy(
+    audio: npt.NDArray[np.floating],
+    *,
+    orig_sr: float,
+    target_sr: float,
+):
+    # lazy import scipy.signal, otherwise it will crash doc build.
+    import scipy.signal
+    if orig_sr > target_sr:
+        return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
+    elif orig_sr < target_sr:
+        return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1)
+    return audio
+class AudioResampler:
+    """Resample audio data to a target sample rate."""
+    def __init__(
+        self,
+        target_sr: Optional[float] = None,
+        method: Literal["librosa", "scipy"] = "librosa",
+    ):
+        self.target_sr = target_sr
+        self.method = method
+    def resample(
+        self,
+        audio: npt.NDArray[np.floating],
+        *,
+        orig_sr: float,
+    ) -> npt.NDArray[np.floating]:
+        if self.target_sr is None:
+            raise RuntimeError("Audio resampling is not supported when "
+                               "`target_sr` is not provided")
+        if self.method == "librosa":
+            return resample_audio_librosa(audio,
+                                          orig_sr=orig_sr,
+                                          target_sr=self.target_sr)
+        elif self.method == "scipy":
+            return resample_audio_scipy(audio,
+                                        orig_sr=orig_sr,
+                                        target_sr=self.target_sr)
+        else:
+            raise ValueError(f"Invalid resampling method: {self.method}. "
+                             "Supported methods are 'librosa' and 'scipy'.")
+class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
+    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
+        return librosa.load(BytesIO(data), sr=None)
+    def load_base64(
+        self,
+        media_type: str,
+        data: str,
+    ) -> tuple[npt.NDArray, float]:
+        return self.load_bytes(base64.b64decode(data))
+    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
+        return librosa.load(filepath, sr=None)
+    def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
+        audio, sr = media
+        with BytesIO() as buffer:
+            soundfile.write(buffer, audio, sr, format="WAV")
+            data = buffer.getvalue()
+        return base64.b64encode(data).decode('utf-8')

vllm/multimodal/base.py ADDED Viewed

@@ -0,0 +1,219 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Sequence
+from pathlib import Path
+from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar
+if TYPE_CHECKING:
+    from vllm.sequence import SequenceGroupMetadata
+from .inputs import MultiModalKwargs, PlaceholderRange
+_T = TypeVar("_T")
+class MultiModalPlaceholderMap:
+    """
+    Relates multi-modal embeddings to their corresponding placeholders.
+    Note: This is only used in V0.
+    """
+    class IndexMap(NamedTuple):
+        # Flattened index lists built by ``index_map()``; that method
+        # guarantees both lists have the same length.
+        src: list[int]
+        dest: list[int]
+    src_ranges: list[range]
+    """
+    The indices of the multi-modal embeddings that will replace the
+    corresponding placeholder embeddings pointed to by ``dest_ranges``.
+    """
+    src_len: int
+    """
+    The total number of flattened multi-modal embeddings.
+    """
+    dest_ranges: list[range]
+    """
+    The indices of the placeholder embeddings that will be replaced by the
+    multimodal embeddings.
+    """
+    dest_len: int
+    """
+    The total number of embeddings in the destination tensor.
+    """
+    def __init__(self):
+        # Start empty; ranges and lengths grow through
+        # ``append_items_from_seq_group`` and ``extend``.
+        self.src_ranges = []
+        self.src_len = 0
+        self.dest_ranges = []
+        self.dest_len = 0
+    @classmethod
+    def from_seq_group(
+        cls, seq_group: "SequenceGroupMetadata", positions: range
+    ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]:
+        """
+        Returns the multi-modal items that intersect with the portion of a
+        prompt (``seq_group``) represented by ``positions``, as well as a
+        ``MultiModalPlaceholderMap`` that relates the multi-modal embedding
+        vectors to their corresponding placeholders.
+        Examples:
+        ```
+        Prompt:    |AAAA BBBB What's in these images?|
+        Positions: |.................................|
+            images      = [A, B]
+            src_ranges  = [(0, 4), (4, 8)]
+            dest_ranges = [(0, 4), (5, 9)]
+        Prompt:    |AAAA BBBB What's in these images?|
+        Positions: |  .....                          |
+            images      = [A, B]
+            src_ranges  = [(2, 4), (4, 6)]
+            dest_ranges = [(0, 2), (3, 5)]
+        Prompt:    |AAAA BBBB What's in these images?|
+        Positions: |     .........                   |
+            images      = [B]
+            src_ranges  = [(0, 4)]
+            dest_ranges = [(0, 4)]
+        Prompt:    |AAAA BBBB What's in these images?|
+        Positions: |          .......................|
+            images      = []
+            src_ranges  = []
+            dest_ranges = []
+        ```
+        """
+        seq_mm_data = seq_group.multi_modal_data
+        seq_mm_placeholders = seq_group.multi_modal_placeholders
+        # Nothing to map when either the data or the placeholders are missing.
+        if not seq_mm_data or not seq_mm_placeholders:
+            return MultiModalKwargs({}), {}
+        placeholder_maps = dict[str, MultiModalPlaceholderMap]()
+        # One map per modality, each built from that modality's placeholders.
+        for modality, placeholders in seq_mm_placeholders.items():
+            placeholder_map = MultiModalPlaceholderMap()
+            # An empty ``positions`` range is falsy, which leaves the map
+            # empty (no ranges, zero lengths).
+            if positions:
+                placeholder_map.append_items_from_seq_group(
+                    positions,
+                    # Dummy, since we don't care about intersecting items
+                    [None] * len(placeholders),
+                    placeholders,
+                )
+            placeholder_maps[modality] = placeholder_map
+        # The sequence group's multi-modal data is returned unfiltered; only
+        # the placeholder maps are restricted to ``positions``.
+        return seq_mm_data, placeholder_maps
+    def append_items_from_seq_group(
+        self,
+        positions: range,
+        multi_modal_items: list[_T],
+        multi_modal_placeholders: Sequence[PlaceholderRange],
+    ) -> list[_T]:
+        """
+        Adds the multi-modal items that intersect ``positions`` to this
+        placeholder map and returns the intersecting items.
+        Raises:
+            ValueError: If ``multi_modal_items`` and
+                ``multi_modal_placeholders`` differ in length.
+        """
+        intersecting_items = []
+        if len(multi_modal_items) != len(multi_modal_placeholders):
+            raise ValueError(
+                "Multi-modal placeholders and items must have the same length."
+            )
+        for placeholder_dict, mm_item in zip(multi_modal_placeholders,
+                                             multi_modal_items):
+            # Prompt positions covered by this item's placeholder.
+            placeholder = range(
+                placeholder_dict.offset,
+                placeholder_dict.offset + placeholder_dict.length,
+            )
+            # Overlap of the placeholder with the requested positions; the
+            # range is empty (falsy) when start >= stop.
+            intersection = range(
+                max(positions.start, placeholder.start),
+                min(positions.stop, placeholder.stop),
+            )
+            if not intersection:
+                # Skip this multi-modal item.
+                continue
+            # Destination indices are relative to the start of ``positions``.
+            token_embedding_range = range(
+                intersection.start - positions.start,
+                intersection.stop - positions.start,
+            )
+            # Source indices are relative to the start of the placeholder,
+            # shifted past the embeddings of items appended earlier.
+            multimodal_embedding_range = range(
+                intersection.start - placeholder.start + self.src_len,
+                intersection.stop - placeholder.start + self.src_len,
+            )
+            intersecting_items.append(mm_item)
+            self.dest_ranges.append(token_embedding_range)
+            self.src_ranges.append(multimodal_embedding_range)
+            # Advance by the full placeholder length, even when only part of
+            # the placeholder intersects ``positions``.
+            self.src_len += len(placeholder)
+        # Counted once per call, whether or not any item intersected.
+        self.dest_len += len(positions)
+        return intersecting_items
+    def extend(self, other: "MultiModalPlaceholderMap"):
+        """
+        Adds the placeholders from another ``MultiModalPlaceholderMap`` to this
+        instance based on the source and destination tensors being
+        concatenated.
+        """
+        # Shift the incoming source ranges by the current source length
+        # before growing that length.
+        self.src_ranges.extend(
+            range(self.src_len + r.start, self.src_len + r.stop)
+            for r in other.src_ranges)
+        self.src_len += other.src_len
+        # Same treatment for the destination side.
+        self.dest_ranges.extend(
+            range(self.dest_len + r.start, self.dest_len + r.stop)
+            for r in other.dest_ranges)
+        self.dest_len += other.dest_len
+    def index_map(self) -> "IndexMap":
+        """
+        Finalizes the placeholder map into lists of indices that can be used to
+        index the source and destination tensors.
+        Raises:
+            ValueError: If the flattened source and destination index lists
+                differ in length.
+        """
+        # Flatten every range into one list of individual indices.
+        src_indices = [i for r in self.src_ranges for i in r]
+        dest_indices = [i for r in self.dest_ranges for i in r]
+        if len(src_indices) != len(dest_indices):
+            raise ValueError(
+                f"The number of source ({len(src_indices)}) and destination "
+                f"indices ({len(dest_indices)}) must be the same.")
+        return self.IndexMap(src=src_indices, dest=dest_indices)
+class MediaIO(ABC, Generic[_T]):
+    """
+    Abstract interface for loading a media object of type ``_T`` from raw
+    bytes, a base64 string, or a file path.
+    """
+    @abstractmethod
+    def load_bytes(self, data: bytes) -> _T:
+        """Build the media object from its raw byte content."""
+        raise NotImplementedError
+    @abstractmethod
+    def load_base64(self, media_type: str, data: str) -> _T:
+        """
+        Build the media object from base64-encoded ``data``.
+        List of media types:
+        https://www.iana.org/assignments/media-types/media-types.xhtml
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def load_file(self, filepath: Path) -> _T:
+        """Build the media object from the file at ``filepath``."""
+        raise NotImplementedError

vllm/multimodal/hasher.py ADDED Viewed

@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pickle
+from collections.abc import Iterable, Mapping
+from typing import TYPE_CHECKING, Optional
+import numpy as np
+import torch
+from blake3 import blake3
+from PIL import Image
+from vllm.logger import init_logger
+from vllm.multimodal.image import convert_image_mode
+if TYPE_CHECKING:
+    from vllm.inputs import TokensPrompt
+logger = init_logger(__name__)
+MultiModalHashDict = Mapping[str, list[str]]
+"""
+A dictionary containing hashes for items in each modality.
+"""
+class MultiModalHasher:
+    @classmethod
+    def serialize_item(cls, obj: object) -> bytes:
+        # Simple cases
+        if isinstance(obj, str):
+            return obj.encode("utf-8")
+        if isinstance(obj, bytes):
+            return obj
+        if isinstance(obj, (int, float)):
+            return np.array(obj).tobytes()
+        if isinstance(obj, Image.Image):
+            return cls.item_to_bytes(
+                "image", np.asarray(convert_image_mode(obj, "RGBA")))
+        if isinstance(obj, torch.Tensor):
+            return cls.item_to_bytes("tensor", obj.numpy())
+        if isinstance(obj, np.ndarray):
+            return cls.item_to_bytes(
+                "ndarray", {
+                    "dtype": obj.dtype.str,
+                    "shape": obj.shape,
+                    "data": obj.tobytes(),
+                })
+        logger.warning(
+            "No serialization method found for %s. "
+            "Falling back to pickle.", type(obj))
+        return pickle.dumps(obj)
+    @classmethod
+    def item_to_bytes(
+        cls,
+        key: str,
+        obj: object,
+    ) -> bytes:
+        return b''.join(kb + vb for kb, vb in cls.iter_item_to_bytes(key, obj))
+    @classmethod
+    def iter_item_to_bytes(
+        cls,
+        key: str,
+        obj: object,
+    ) -> Iterable[tuple[bytes, bytes]]:
+        # Recursive cases
+        if isinstance(obj, (list, tuple)):
+            for i, elem in enumerate(obj):
+                yield from cls.iter_item_to_bytes(f"{key}.{i}", elem)
+        elif isinstance(obj, dict):
+            for k, v in obj.items():
+                yield from cls.iter_item_to_bytes(f"{key}.{k}", v)
+        else:
+            key_bytes = cls.serialize_item(key)
+            value_bytes = cls.serialize_item(obj)
+            yield key_bytes, value_bytes
+    @classmethod
+    def hash_kwargs(cls, **kwargs: object) -> str:
+        hasher = blake3()
+        for k, v in kwargs.items():
+            for k_bytes, v_bytes in cls.iter_item_to_bytes(k, v):
+                hasher.update(k_bytes)
+                hasher.update(v_bytes)
+        return hasher.hexdigest()
+    @classmethod
+    def hash_prompt_mm_data(
+            cls, prompt: "TokensPrompt") -> Optional["MultiModalHashDict"]:
+        """Hash multimodal data in the user input prompt if they exist."""
+        if "multi_modal_data" not in prompt:
+            return None
+        mm_data = prompt["multi_modal_data"]
+        if not mm_data:
+            # mm_data can be None or an empty dict.
+            return None
+        mm_items = {
+            modality: items if isinstance(items, list) else [items]
+            for modality, items in mm_data.items()
+        }
+        mm_hashes = {
+            modality: [cls.hash_kwargs(**{modality: item}) for item in items]
+            for modality, items in mm_items.items()
+        }
+        return mm_hashes

vllm/multimodal/image.py ADDED Viewed

@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import base64
+from io import BytesIO
+from pathlib import Path
+import torch
+from PIL import Image
+from .base import MediaIO
+def rescale_image_size(image: Image.Image,
+                       size_factor: float,
+                       transpose: int = -1) -> Image.Image:
+    """Rescale the dimensions of an image by a constant factor."""
+    new_width = int(image.width * size_factor)
+    new_height = int(image.height * size_factor)
+    image = image.resize((new_width, new_height))
+    if transpose >= 0:
+        image = image.transpose(Image.Transpose(transpose))
+    return image
+# TODO: Support customizable background color to fill in.
+def rgba_to_rgb(
+    image: Image.Image, background_color=(255, 255, 255)) -> Image.Image:
+    """Convert an RGBA image to RGB with filled background color."""
+    assert image.mode == "RGBA"
+    converted = Image.new("RGB", image.size, background_color)
+    converted.paste(image, mask=image.split()[3])  # 3 is the alpha channel
+    return converted
+def convert_image_mode(image: Image.Image, to_mode: str):
+    if image.mode == to_mode:
+        return image
+    elif image.mode == "RGBA" and to_mode == "RGB":
+        return rgba_to_rgb(image)
+    else:
+        return image.convert(to_mode)
+class ImageMediaIO(MediaIO[Image.Image]):
+    def __init__(self, *, image_mode: str = "RGB") -> None:
+        super().__init__()
+        self.image_mode = image_mode
+    def load_bytes(self, data: bytes) -> Image.Image:
+        image = Image.open(BytesIO(data))
+        image.load()
+        return convert_image_mode(image, self.image_mode)
+    def load_base64(self, media_type: str, data: str) -> Image.Image:
+        return self.load_bytes(base64.b64decode(data))
+    def load_file(self, filepath: Path) -> Image.Image:
+        image = Image.open(filepath)
+        image.load()
+        return convert_image_mode(image, self.image_mode)
+    def encode_base64(
+        self,
+        media: Image.Image,
+        *,
+        image_format: str = "JPEG",
+    ) -> str:
+        image = media
+        with BytesIO() as buffer:
+            image = convert_image_mode(image, self.image_mode)
+            image.save(buffer, image_format)
+            data = buffer.getvalue()
+        return base64.b64encode(data).decode('utf-8')
+class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
+    def __init__(self) -> None:
+        super().__init__()
+    def load_bytes(self, data: bytes) -> torch.Tensor:
+        buffer = BytesIO(data)
+        return torch.load(buffer, weights_only=True)
+    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
+        return self.load_bytes(base64.b64decode(data))
+    def load_file(self, filepath: Path) -> torch.Tensor:
+        return torch.load(filepath, weights_only=True)
+    def encode_base64(self, media: torch.Tensor) -> str:
+        return base64.b64encode(media.numpy()).decode('utf-8')