PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/config/utils.py ADDED Viewed

@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import inspect
+import textwrap
+from dataclasses import MISSING, Field, field, fields, is_dataclass
+from typing import TYPE_CHECKING, Any, TypeVar
+import regex as re
+if TYPE_CHECKING:
+    from _typeshed import DataclassInstance
+    ConfigType = type[DataclassInstance]
+else:
+    ConfigType = type
+ConfigT = TypeVar("ConfigT", bound=ConfigType)
+def config(cls: ConfigT) -> ConfigT:
+    """
+    Mark a dataclass as a vLLM config class.
+    At runtime this decorator does nothing: it hands `cls` back unchanged.
+    The rules it stands for (every field has a default value and every
+    field has a docstring) are checked by the tools/validate_config.py
+    script, which is invoked during the pre-commit checks.
+    If a `ConfigT` is used as a CLI argument itself, the `type` keyword
+    argument provided by `get_kwargs` will be
+    `pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)`, i.e. the
+    `cli_arg` is treated as a JSON string and validated by `pydantic`.
+    """
+    return cls
+def get_field(cls: ConfigType, name: str) -> Field:
+    """Create a new dataclass `Field` that carries the default of
+    `cls.<name>`. Used for getting default factory fields in `EngineArgs`.
+    A default factory takes precedence over a plain default value.
+    Raises:
+        TypeError: If `cls` is not a dataclass.
+        ValueError: If `cls` has no field called `name`, or the field has
+            neither a default value nor a default factory.
+    """
+    if not is_dataclass(cls):
+        raise TypeError("The given class is not a dataclass.")
+    candidates = [f for f in fields(cls) if f.name == name]
+    if not candidates:
+        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
+    source_field: Field = candidates[0]
+    factory = source_field.default_factory
+    if factory is not MISSING:
+        return field(default_factory=factory)
+    value = source_field.default
+    if value is not MISSING:
+        return field(default=value)
+    raise ValueError(
+        f"{cls.__name__}.{name} must have a default value or default factory.")
+def contains_object_print(text: str) -> bool:
+    """
+    Tell whether `text` looks like it embeds the default repr of a Python
+    object, i.e. contains a substring such as "at 0xFFFFFFF>".
+    The address part is "0x" followed by 2-16 hex characters (16 being the
+    maximum on a 64-bit system).
+    Args:
+        text (str): The text to check
+    Returns:
+        result (bool): `True` if a match is found, `False` otherwise.
+    """
+    return re.search(r'at 0x[a-fA-F0-9]{2,16}>', text) is not None
+def assert_hashable(text: str) -> bool:
+    """
+    Guard against hashing text that embeds Python object ids.
+    Returns `True` when `contains_object_print(text)` finds nothing;
+    otherwise raises `AssertionError` quoting the offending text.
+    """
+    if contains_object_print(text):
+        raise AssertionError(
+            f"vLLM tried to hash some configs that may have Python objects ids "
+            f"in them. This is a bug, please file an issue. "
+            f"Text being hashed: {text}")
+    return True
+def get_attr_docs(cls: type[Any]) -> dict[str, str]:
+    """
+    Get any docstrings placed after attribute assignments in a class body.
+    A "docstring" here is a bare string literal statement that directly
+    follows an assignment (`x = 1`) or an annotated assignment
+    (`x: int = 1`) in the class body.
+    https://davidism.com/mit-license/
+    Args:
+        cls: The class whose source code is inspected.
+    Returns:
+        A mapping from attribute name to its docstring, cleaned with
+        `inspect.cleandoc`.
+    Raises:
+        TypeError: If the parsed source of `cls` is not a class definition.
+    """
+    def pairwise(iterable):
+        """
+        Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise
+        Can be removed when Python 3.9 support is dropped.
+        """
+        iterator = iter(iterable)
+        a = next(iterator, None)
+        for b in iterator:
+            yield a, b
+            a = b
+    def parse_class_source():
+        """Parse the (dedented) source of `cls` and return its first node."""
+        return ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
+    try:
+        cls_node = parse_class_source()
+    except (OSError, KeyError, TypeError):
+        # HACK: Python 3.13+ workaround - set missing __firstlineno__
+        # Workaround can be removed after we upgrade to pydantic==2.12.0
+        import tokenize
+        # Match the class name as a whole word so that `class FooBar:` is
+        # not mistaken for `class Foo:`.
+        header = re.compile(rf"class {re.escape(cls.__name__)}\b")
+        # tokenize.open decodes the file with its declared source encoding
+        # (UTF-8 by default) instead of the locale's preferred encoding.
+        with tokenize.open(inspect.getfile(cls)) as f:
+            for i, line in enumerate(f):
+                if header.search(line) and ":" in line:
+                    cls.__firstlineno__ = i + 1
+                    break
+        cls_node = parse_class_source()
+    if not isinstance(cls_node, ast.ClassDef):
+        raise TypeError("Given object was not a class.")
+    out = {}
+    # Consider each pair of nodes.
+    for a, b in pairwise(cls_node.body):
+        # Must be an assignment then a constant string.
+        if (not isinstance(a, (ast.Assign, ast.AnnAssign))
+                or not isinstance(b, ast.Expr)
+                or not isinstance(b.value, ast.Constant)
+                or not isinstance(b.value.value, str)):
+            continue
+        doc = inspect.cleandoc(b.value.value)
+        # An assignment can have multiple targets (a = b = v), but an
+        # annotated assignment only has one target.
+        targets = a.targets if isinstance(a, ast.Assign) else [a.target]
+        for target in targets:
+            # Must be assigning to a plain name.
+            if not isinstance(target, ast.Name):
+                continue
+            out[target.id] = doc
+    return out
+def is_init_field(cls: ConfigType, name: str) -> bool:
+    """Return the `init` flag of the dataclass field `name` on `cls`.
+    Raises:
+        ValueError: If `cls` has no field called `name`. Raised explicitly
+            so that a missing field does not leak a bare `StopIteration`.
+    """
+    for f in fields(cls):
+        if f.name == name:
+            return f.init
+    raise ValueError(f"Field '{name}' not found in {cls.__name__}.")

vllm/connections.py ADDED Viewed

@@ -0,0 +1,186 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Mapping, MutableMapping
+from pathlib import Path
+from typing import Any, Optional
+from urllib.parse import urlparse
+import aiohttp
+import requests
+from vllm.version import __version__ as VLLM_VERSION
+class HTTPConnection:
+    """Helper class to send HTTP requests.
+    Wraps a synchronous `requests.Session` and an asynchronous
+    `aiohttp.ClientSession`. With `reuse_client=True` (the default) each
+    client is created on first use and then reused; otherwise a new client
+    is created every time one is requested.
+    """
+    def __init__(self, *, reuse_client: bool = True) -> None:
+        super().__init__()
+        self.reuse_client = reuse_client
+        self._sync_client: Optional[requests.Session] = None
+        self._async_client: Optional[aiohttp.ClientSession] = None
+    def get_sync_client(self) -> requests.Session:
+        """Return the sync session, creating one if needed (see class doc)."""
+        if self._sync_client is None or not self.reuse_client:
+            self._sync_client = requests.Session()
+        return self._sync_client
+    # NOTE: We intentionally use an async function even though it is not
+    # required, so that the client is only accessible inside async event loop
+    async def get_async_client(self) -> aiohttp.ClientSession:
+        """Return the async session, creating one if needed (see class doc)."""
+        if self._async_client is None or not self.reuse_client:
+            self._async_client = aiohttp.ClientSession(trust_env=True)
+        return self._async_client
+    def _validate_http_url(self, url: str):
+        """Raise `ValueError` unless the scheme of `url` is http or https."""
+        parsed_url = urlparse(url)
+        if parsed_url.scheme not in ("http", "https"):
+            raise ValueError("Invalid HTTP URL: A valid HTTP URL "
+                             "must have scheme 'http' or 'https'.")
+    def _headers(self, **extras: str) -> MutableMapping[str, str]:
+        """Build request headers: the vLLM User-Agent plus `extras`.
+        A "User-Agent" entry in `extras` overrides the default one.
+        """
+        return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras}
+    def get_response(
+        self,
+        url: str,
+        *,
+        stream: bool = False,
+        timeout: Optional[float] = None,
+        extra_headers: Optional[Mapping[str, str]] = None,
+        allow_redirects: bool = True,
+    ):
+        """Validate `url` and issue a synchronous GET request.
+        Returns whatever `requests.Session.get` returns; the status is not
+        checked here.
+        Raises:
+            ValueError: If `url` is not an http(s) URL.
+        """
+        self._validate_http_url(url)
+        client = self.get_sync_client()
+        extra_headers = extra_headers or {}
+        return client.get(url,
+                          headers=self._headers(**extra_headers),
+                          stream=stream,
+                          timeout=timeout,
+                          allow_redirects=allow_redirects)
+    async def get_async_response(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+        extra_headers: Optional[Mapping[str, str]] = None,
+        allow_redirects: bool = True,
+    ):
+        """Validate `url` and prepare an asynchronous GET request.
+        The object returned by `client.get(...)` is handed back without
+        being awaited; the methods of this class enter it with `async with`.
+        Raises:
+            ValueError: If `url` is not an http(s) URL.
+        """
+        self._validate_http_url(url)
+        client = await self.get_async_client()
+        extra_headers = extra_headers or {}
+        return client.get(url,
+                          headers=self._headers(**extra_headers),
+                          timeout=timeout,
+                          allow_redirects=allow_redirects)
+    def get_bytes(self,
+                  url: str,
+                  *,
+                  timeout: Optional[float] = None,
+                  allow_redirects: bool = True) -> bytes:
+        """GET `url` and return the response body as bytes.
+        Calls `raise_for_status()` on the response first.
+        """
+        with self.get_response(url,
+                               timeout=timeout,
+                               allow_redirects=allow_redirects) as r:
+            r.raise_for_status()
+            return r.content
+    async def async_get_bytes(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+        allow_redirects: bool = True,
+    ) -> bytes:
+        """Async counterpart of `get_bytes`."""
+        async with await self.get_async_response(
+                url, timeout=timeout, allow_redirects=allow_redirects) as r:
+            r.raise_for_status()
+            return await r.read()
+    def get_text(self, url: str, *, timeout: Optional[float] = None) -> str:
+        """GET `url` and return the response body as text.
+        Calls `raise_for_status()` on the response first.
+        """
+        with self.get_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return r.text
+    async def async_get_text(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+    ) -> str:
+        """Async counterpart of `get_text`."""
+        async with await self.get_async_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return await r.text()
+    def get_json(self, url: str, *, timeout: Optional[float] = None) -> Any:
+        """GET `url` and return the decoded JSON body.
+        The result is whatever `r.json()` produces, not a `str`. Calls
+        `raise_for_status()` on the response first.
+        """
+        with self.get_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return r.json()
+    async def async_get_json(
+        self,
+        url: str,
+        *,
+        timeout: Optional[float] = None,
+    ) -> Any:
+        """Async counterpart of `get_json`."""
+        async with await self.get_async_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            return await r.json()
+    def download_file(
+        self,
+        url: str,
+        save_path: Path,
+        *,
+        timeout: Optional[float] = None,
+        chunk_size: int = 128,
+    ) -> Path:
+        """GET `url` and write the body to `save_path` in chunks.
+        Returns `save_path`. Calls `raise_for_status()` before writing.
+        """
+        # stream=True so that the body is fetched chunk by chunk while
+        # iterating; without it the whole body is loaded into memory first.
+        with self.get_response(url, stream=True, timeout=timeout) as r:
+            r.raise_for_status()
+            with save_path.open("wb") as f:
+                for chunk in r.iter_content(chunk_size):
+                    f.write(chunk)
+        return save_path
+    async def async_download_file(
+        self,
+        url: str,
+        save_path: Path,
+        *,
+        timeout: Optional[float] = None,
+        chunk_size: int = 128,
+    ) -> Path:
+        """Async counterpart of `download_file`.
+        Note that the file writes themselves are ordinary blocking writes.
+        """
+        async with await self.get_async_response(url, timeout=timeout) as r:
+            r.raise_for_status()
+            with save_path.open("wb") as f:
+                async for chunk in r.content.iter_chunked(chunk_size):
+                    f.write(chunk)
+        return save_path
+# Created once at import time with the default `reuse_client=True`.
+global_http_connection = HTTPConnection()
+"""
+The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used
+by vLLM.
+"""

vllm/device_allocator/__init__.py ADDED Viewed

File without changes

vllm/device_allocator/cumem.py ADDED Viewed

@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# cumem-based pytorch pluggable allocator to implement sleep mode.
+# other approaches tried but failed:
+# - cuda-python package binding
+# - custom libcuda driver ctypes wrapper
+# both of them failed because of cuda context mismatch.
+# not sure why, they are created from a different context.
+# the only successful approach is to call cuda driver API in C.
+import dataclasses
+import gc
+import os
+from contextlib import contextmanager
+from typing import Any, Callable, Optional, Union
+import torch
+from vllm.logger import init_logger
+from vllm.utils import is_pin_memory_available
+logger = init_logger(__name__)
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    Locate a shared library that is already loaded in the current process.
+    `/proc/self/maps` contains the memory maps of the process, which
+    include the shared libraries it has loaded (see
+    https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html). The first
+    line mentioning `lib_name` is used to recover the library's path.
+    Returns the path, or `None` if no line mentions `lib_name`.
+    """
+    with open("/proc/self/maps") as maps_file:
+        matching_line = next(
+            (entry for entry in maps_file if lib_name in entry), None)
+    if matching_line is None:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    lib_path = matching_line[matching_line.index("/"):].strip()
+    base_name = lib_path.split("/")[-1]
+    stem, _, _ = base_name.rpartition(".so")
+    assert stem.startswith(lib_name), \
+        f"Unexpected filename: {base_name} for library {lib_name}"
+    return lib_path
+# Flipped to True only if every import and lookup in the `try` succeeds.
+cumem_available = False
+try:
+    from vllm.cumem_allocator import (init_module, python_create_and_map,
+                                      python_unmap_and_release)
+    from vllm.distributed.device_communicators.cuda_wrapper import (
+        CudaRTLibrary)
+    # Path of the loaded cumem_allocator library as found in
+    # /proc/self/maps, or None if no entry mentions it.
+    lib_name = find_loaded_library("cumem_allocator")
+    libcudart = CudaRTLibrary()
+    cumem_available = True
+except ModuleNotFoundError:
+    # rocm platform does not support cumem allocator
+    # Bind every name to None so that this module can still be imported;
+    # `cumem_available` stays False.
+    init_module = None
+    python_create_and_map = None
+    python_unmap_and_release = None
+    CudaRTLibrary = None
+    lib_name = None
+    libcudart = None
+# py_device, py_alignedSize, py_d_mem, py_p_memHandle
+# Index 1 is used below as the allocation size in bytes and index 2 as the
+# key into `CuMemAllocator.pointer_to_data`.
+HandleType = tuple[int, int, int, int]
+@dataclasses.dataclass
+class AllocationData:
+    """Bookkeeping record for one allocation tracked by `CuMemAllocator`."""
+    # Handle tuple that is unpacked into python_create_and_map /
+    # python_unmap_and_release.
+    handle: HandleType
+    # Tag that was current when the allocation was made.
+    tag: str
+    # CPU copy of the data made by `CuMemAllocator.sleep`; None otherwise.
+    cpu_backup_tensor: Optional[torch.Tensor] = None
+def create_and_map(allocation_handle: HandleType) -> None:
+    """Pass the unpacked fields of the handle to `python_create_and_map`."""
+    python_create_and_map(*allocation_handle)
+def unmap_and_release(allocation_handle: HandleType) -> None:
+    """Pass the unpacked fields of the handle to `python_unmap_and_release`."""
+    python_unmap_and_release(*allocation_handle)
+def get_pluggable_allocator(
+    python_malloc_fn: Callable[[int], int],
+    python_free_func: Callable[[int, int], None],
+) -> torch.cuda.memory.CUDAPluggableAllocator:
+    """
+    Register the two Python callbacks with the C extension via
+    `init_module`, then build a `CUDAPluggableAllocator` that uses the
+    `my_malloc` / `my_free` symbols of the library at `lib_name`.
+    """
+    init_module(python_malloc_fn, python_free_func)
+    return torch.cuda.memory.CUDAPluggableAllocator(lib_name, 'my_malloc',
+                                                    'my_free')
+@contextmanager
+def use_memory_pool_with_allocator(
+    python_malloc_fn: Callable[[int], int],
+    python_free_func: Callable[[int, int], None],
+) -> None:
+    """
+    Context manager that activates a `MemPool` backed by a pluggable
+    allocator built from the two callbacks.
+    Yields the tuple `(mem_pool, allocator)` while
+    `torch.cuda.memory.use_mem_pool(mem_pool)` is active.
+    """
+    allocator = get_pluggable_allocator(python_malloc_fn, python_free_func)
+    pool = torch.cuda.memory.MemPool(allocator._allocator)
+    with torch.cuda.memory.use_mem_pool(pool):
+        yield pool, allocator
+class CuMemAllocator:
+    """
+    A singleton class that manages a memory pool for CUDA tensors.
+    The memory in this pool can be offloaded or discarded when the
+    allocator sleeps.
+    Inside the `use_memory_pool(tag)` context, all tensors created will
+    be allocated in the memory pool, and has the same tag as the
+    tag passed to the context.
+    When we call `sleep`, all tensors with the specified tag will be
+    offloaded to CPU memory, and the rest of the tensors will be discarded.
+    When we call `wake_up`, all tensors that are previously offloaded
+    will be loaded back to GPU memory, and the rest of the tensors will
+    have empty memory.
+    Why it needs to be a singleton?
+    When allocated tensors are garbage collected, PyTorch will call
+    the free callback, which will call the `python_free_callback` method.
+    The C-extension uses a global variable to store the function of an
+    instance of this class. If we create multiple instances of this class,
+    the global variable will be overwritten and the free callback will
+    not work as expected.
+    """
+    # None until `get_instance` is called for the first time.
+    instance: Optional["CuMemAllocator"] = None
+    default_tag: str = "default"
+    @staticmethod
+    def get_instance() -> "CuMemAllocator":
+        """
+        CuMemAllocator is a singleton class.
+        We cannot call the constructor directly.
+        Call this method to get the instance.
+        """
+        assert cumem_available, "cumem allocator is not available"
+        if CuMemAllocator.instance is None:
+            CuMemAllocator.instance = CuMemAllocator()
+        return CuMemAllocator.instance
+    def __init__(self):
+        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        assert "expandable_segments:True" not in conf, \
+            ("Expandable segments are not compatible with memory pool. "
+            "Please track https://github.com/pytorch/pytorch/issues/147851 "
+            "for the latest updates.")
+        # Maps the py_d_mem entry of a handle to its bookkeeping record.
+        self.pointer_to_data: dict[int, AllocationData] = {}
+        self.current_tag: str = CuMemAllocator.default_tag
+        self.allocator_and_pools: dict[str, Any] = {}
+        # Creating strong references to the two callbacks here to prevent
+        # these ephemeral bound-method objects being garbage collected.
+        # See discussions in https://github.com/vllm-project/vllm/pull/22724
+        self.python_malloc_callback = self._python_malloc_callback
+        self.python_free_callback = self._python_free_callback
+    def _python_malloc_callback(self, allocation_handle: HandleType) -> None:
+        """
+        Internal method to store the allocation data
+        when memory is allocated in the memory pool."""
+        py_d_mem = allocation_handle[2]
+        self.pointer_to_data[py_d_mem] = AllocationData(
+            allocation_handle, self.current_tag)
+        logger.debug(
+            "Allocated %s bytes for %s with address %s from cumem allocator",
+            allocation_handle[1], self.current_tag, py_d_mem)
+        return
+    def _python_free_callback(self, ptr: int) -> HandleType:
+        """
+        Internal method to look up the allocation data
+        when memory is freed in the memory pool.
+        The record for `ptr` is removed from `pointer_to_data` and its
+        handle is returned; a `KeyError` is raised for an unknown `ptr`."""
+        data = self.pointer_to_data.pop(ptr)
+        if data.cpu_backup_tensor is not None:
+            data.cpu_backup_tensor = None
+        logger.debug(
+            "Freed %s bytes for %s with address %s from cumem allocator",
+            data.handle[1], data.tag, ptr)
+        return data.handle
+    def sleep(
+            self,
+            offload_tags: Optional[Union[tuple[str, ...],
+                                         str]] = None) -> None:
+        """
+        Put the allocator in sleep mode.
+        All data in the memory allocation with the specified tag will be
+        offloaded to CPU memory, and others will be discarded.
+        :param offload_tags: The tags of the memory allocation that will be
+            offloaded. The rest of the memory allocation will be discarded.
+        """
+        if offload_tags is None:
+            # by default, allocated tensors are offloaded
+            # when the allocator sleeps
+            offload_tags = (CuMemAllocator.default_tag, )
+        elif isinstance(offload_tags, str):
+            offload_tags = (offload_tags, )
+        assert isinstance(offload_tags, tuple)
+        total_bytes = 0
+        backup_bytes = 0
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            total_bytes += handle[1]
+            if data.tag in offload_tags:
+                backup_bytes += handle[1]
+                size_in_bytes = handle[1]
+                cpu_backup_tensor = torch.empty(
+                    size_in_bytes,
+                    dtype=torch.uint8,
+                    device='cpu',
+                    pin_memory=is_pin_memory_available())
+                cpu_ptr = cpu_backup_tensor.data_ptr()
+                libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes)
+                data.cpu_backup_tensor = cpu_backup_tensor
+            # Every allocation is unmapped, whether it was backed up or not.
+            unmap_and_release(handle)
+        logger.info(
+            "CuMemAllocator: sleep freed %.2f GiB memory in total, of which "
+            "%.2f GiB is backed up in CPU and the rest %.2f GiB is discarded "
+            "directly.", total_bytes / 1024**3, backup_bytes / 1024**3,
+            (total_bytes - backup_bytes) / 1024**3)
+        gc.collect()
+        torch.cuda.empty_cache()
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        """
+        Wake up the allocator from sleep mode.
+        All data that is previously offloaded will be loaded back to GPU
+        memory, and the rest of the data will have empty memory.
+        :param tags: The tags of the memory allocation that will be loaded
+            back to GPU memory. If None, all memory allocation will be loaded
+            back to GPU memory.
+        """
+        for ptr, data in self.pointer_to_data.items():
+            if tags is not None and data.tag not in tags:
+                continue
+            create_and_map(data.handle)
+            cpu_backup_tensor = data.cpu_backup_tensor
+            if cpu_backup_tensor is None:
+                # Nothing was backed up: the memory is mapped but left empty.
+                continue
+            size_in_bytes = (cpu_backup_tensor.numel() *
+                             cpu_backup_tensor.element_size())
+            cpu_ptr = cpu_backup_tensor.data_ptr()
+            libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
+            data.cpu_backup_tensor = None
+    @contextmanager
+    def use_memory_pool(self, tag: Optional[str] = None):
+        """
+        A context manager to use the memory pool.
+        All memory allocation created inside the context will be allocated
+        in the memory pool, and has the specified tag.
+        The previous tag is restored when the context exits, including
+        when the managed body raises.
+        :param tag: The tag of the memory allocation. If None, the default tag
+            will be used.
+        """
+        if tag is None:
+            tag = CuMemAllocator.default_tag
+        assert isinstance(tag, str)
+        old_tag = self.current_tag
+        self.current_tag = tag
+        try:
+            with use_memory_pool_with_allocator(
+                    self.python_malloc_callback,
+                    self.python_free_callback) as data:
+                # start to hit another PyTorch bug in PyTorch 2.6,
+                # possibly because of gc-related issue w.r.t. the allocator
+                # and the memory pool.
+                # to avoid the issue, we keep a reference of the data.
+                # see https://github.com/pytorch/pytorch/issues/146431 .
+                self.allocator_and_pools[tag] = data
+                yield
+                # PyTorch's bug, calling torch.cuda.empty_cache() will error
+                # when using pluggable allocator, see
+                # https://github.com/pytorch/pytorch/issues/145168 .
+                # if we have some memory allocated and then freed,
+                # the memory will not be released, e.g. in online
+                # quantization, where the model is created in higher
+                # precision, and then quantized in lower precision.
+                # Find all unused allocations and manually release them.
+                # TODO: we should expose `empty_cache` method in the memory
+                # pool.
+                # TODO: ask for help from PyTorch team to expose this method.
+                allocations = data[0].snapshot()
+                for allocation in allocations:
+                    if allocation["allocated_size"] == 0:
+                        handle = self._python_free_callback(
+                            allocation["address"])
+                        unmap_and_release(handle)
+        finally:
+            # Restore the tag even if the managed body raised, so later
+            # allocations are not recorded under a stale tag.
+            self.current_tag = old_tag
+    def get_current_usage(self) -> int:
+        """
+        Get the total number of bytes allocated in the memory pool.
+        """
+        return sum(data.handle[1] for data in self.pointer_to_data.values())

vllm/distributed/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .communication_op import *
+from .parallel_state import *
+from .utils import *

vllm/distributed/communication_op.py ADDED Viewed

@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Optional, Union
+import torch
+import torch.distributed
+from .parallel_state import get_tp_group
+def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
+    """All-reduce `input_` across the group returned by `get_tp_group()`."""
+    tp_group = get_tp_group()
+    return tp_group.all_reduce(input_)
+def tensor_model_parallel_all_gather(input_: torch.Tensor,
+                                     dim: int = -1) -> torch.Tensor:
+    """All-gather `input_` along `dim` across the group returned by
+    `get_tp_group()`."""
+    tp_group = get_tp_group()
+    return tp_group.all_gather(input_, dim)
+def tensor_model_parallel_reduce_scatter(input_: torch.Tensor,
+                                         dim: int = -1) -> torch.Tensor:
+    """Reduce-scatter `input_` along `dim` across the group returned by
+    `get_tp_group()`."""
+    tp_group = get_tp_group()
+    return tp_group.reduce_scatter(input_, dim)
+def tensor_model_parallel_gather(input_: torch.Tensor,
+                                 dst: int = 0,
+                                 dim: int = -1) -> Optional[torch.Tensor]:
+    """Gather `input_` along `dim` to rank `dst` across the group returned
+    by `get_tp_group()`; the result is whatever the group's `gather`
+    returns."""
+    tp_group = get_tp_group()
+    return tp_group.gather(input_, dst, dim)
+def broadcast_tensor_dict(tensor_dict: Optional[dict[Any, Union[torch.Tensor,
+                                                                Any]]] = None,
+                          src: int = 0):
+    """Broadcast `tensor_dict` from rank `src` through the group returned
+    by `get_tp_group()`.
+    When `torch.distributed` is not initialized, `tensor_dict` is returned
+    as-is."""
+    if torch.distributed.is_initialized():
+        return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
+    return tensor_dict

vllm/distributed/device_communicators/__init__.py ADDED Viewed

File without changes