PyPI - vllm-cpu - Versions diffs - 0.12.0__cp313-cp313-manylinux_2_17_aarch64.whl - Mend

vllm-cpu 0.12.0__cp313-cp313-manylinux_2_17_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1600) hide show

vllm/config/device.py ADDED Viewed

@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import field
+from typing import Any, Literal
+import torch
+from pydantic import ConfigDict, SkipValidation
+from pydantic.dataclasses import dataclass
+from vllm.config.utils import config
+from vllm.utils.hashing import safe_hash
+Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]  # string choices in the DeviceConfig.device annotation
+@config
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class DeviceConfig:
+    """Configuration for the device to use for vLLM execution."""
+    device: SkipValidation[Device | torch.device | None] = "auto"
+    """Device type for vLLM execution.
+    This parameter is deprecated and will be
+    removed in a future release.
+    It will now be set automatically based
+    on the current platform."""
+    device_type: str = field(init=False)
+    """Device type from the current platform. This is set in
+    `__post_init__`."""
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # the device/platform information will be summarized
+        # by torch/vllm automatically.
+        factors: list[Any] = []
+        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
+        return hash_str
+    def __post_init__(self):
+        """Resolve `device_type`, then rewrite `device` to match it.
+        `"auto"` takes the type from `current_platform`; an explicit string
+        is used as-is and a `torch.device` contributes its `.type`.
+        Afterwards `device` becomes `None` for "tpu" and
+        `torch.device(device_type)` for every other type.
+        Raises:
+            RuntimeError: If `"auto"` is requested but the current platform
+                reports an empty device type.
+            ValueError: If `device` is neither a string nor a
+                `torch.device` (for example `None`), because no device
+                type can be derived from it.
+        """
+        if self.device == "auto":
+            # Automated device type detection
+            from vllm.platforms import current_platform
+            self.device_type = current_platform.device_type
+            if not self.device_type:
+                raise RuntimeError(
+                    "Failed to infer device type, please set "
+                    "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
+                    "to turn on verbose logging to help debug the issue."
+                )
+        elif isinstance(self.device, str):
+            # Device type is assigned explicitly
+            self.device_type = self.device
+        elif isinstance(self.device, torch.device):
+            self.device_type = self.device.type
+        else:
+            # Without this branch `device_type` would stay unset and the
+            # lookup below would fail with an opaque AttributeError.
+            raise ValueError(
+                f"Unsupported device: {self.device!r}. Expected 'auto', "
+                "a device type string, or a torch.device."
+            )
+        # Some device types require processing inputs on CPU
+        if self.device_type in ["tpu"]:
+            self.device = None
+        else:
+            # Set device with device type
+            self.device = torch.device(self.device_type)

vllm/config/ec_transfer.py ADDED Viewed

@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import hashlib
+import uuid
+from dataclasses import field
+from typing import Any, Literal, get_args
+from pydantic.dataclasses import dataclass
+from vllm.config.utils import config
+ECProducer = Literal["ec_producer"]  # role value matched by is_ec_producer
+ECConsumer = Literal["ec_consumer"]  # role value matched by is_ec_consumer
+ECRole = Literal[ECProducer, ECConsumer]  # every value ec_role may take besides None
+@config
+@dataclass
+class ECTransferConfig:
+    """Configuration for distributed EC cache transfer."""
+    ec_connector: str | None = None
+    """The EC connector for vLLM to transmit EC caches between vLLM instances.
+    """
+    engine_id: str | None = None
+    """The engine id for EC transfers."""
+    ec_buffer_device: str | None = "cuda"
+    """The device used by ec connector to buffer the EC cache.
+    Currently only support 'cuda'."""
+    ec_buffer_size: float = 1e9
+    """The buffer size for TorchDistributedConnector. Measured in number of
+    bytes. Recommended value: 1e9 (about 1GB)."""
+    ec_role: ECRole | None = None
+    """Whether this vLLM instance produces or consumes EC cache. Choices
+    are 'ec_producer' and 'ec_consumer'."""
+    ec_rank: int | None = None
+    """The rank of this vLLM instance in the EC cache transfer. Typical value:
+    0 for encoder, 1 for pd instance.
+    Currently only 1P1D is supported."""
+    ec_parallel_size: int = 1
+    """The number of parallel instances for EC cache transfer. For
+    PyNcclConnector, this should be 2."""
+    ec_ip: str = "127.0.0.1"
+    """The EC connector ip, used to build distributed connection."""
+    ec_port: int = 14579
+    """The EC connector port, used to build distributed connection."""
+    ec_connector_extra_config: dict[str, Any] = field(default_factory=dict)
+    """any extra config that the connector may need."""
+    ec_connector_module_path: str | None = None
+    """The Python module path to dynamically load the EC connector from.
+    Only supported in V1."""
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # Use the project's hashing helper, as the other config classes do,
+        # instead of calling hashlib.md5 directly. Imported at function
+        # scope because this module does not import it at the top.
+        from vllm.utils.hashing import safe_hash
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
+        return hash_str
+    def __post_init__(self) -> None:
+        """Fill in a default engine id and validate the role settings.
+        Raises:
+            ValueError: If `ec_role` is set to a value outside `ECRole`,
+                or if `ec_connector` is set while `ec_role` is `None`.
+        """
+        if self.engine_id is None:
+            # Give every instance its own id when the caller supplied none.
+            self.engine_id = str(uuid.uuid4())
+        if self.ec_role is not None and self.ec_role not in get_args(ECRole):
+            raise ValueError(
+                f"Unsupported ec_role: {self.ec_role}. "
+                f"Supported roles are {get_args(ECRole)}"
+            )
+        if self.ec_connector is not None and self.ec_role is None:
+            raise ValueError(
+                "Please specify ec_role when ec_connector "
+                f"is set, supported roles are {get_args(ECRole)}"
+            )
+    @property
+    def is_ec_transfer_instance(self) -> bool:
+        """True when a connector is set and `ec_role` is any valid role."""
+        return self.ec_connector is not None and self.ec_role in get_args(ECRole)
+    @property
+    def is_ec_producer(self) -> bool:
+        """True when a connector is set and `ec_role` is a producer role."""
+        return self.ec_connector is not None and self.ec_role in get_args(ECProducer)
+    @property
+    def is_ec_consumer(self) -> bool:
+        """True when a connector is set and `ec_role` is a consumer role."""
+        return self.ec_connector is not None and self.ec_role in get_args(ECConsumer)
+    def get_from_extra_config(self, key, default) -> Any:
+        """Return `ec_connector_extra_config[key]`, or `default` if absent."""
+        return self.ec_connector_extra_config.get(key, default)

vllm/config/kv_events.py ADDED Viewed

@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Literal
+from pydantic import Field
+from pydantic.dataclasses import dataclass
+from vllm.config.utils import config
+@config
+@dataclass
+class KVEventsConfig:
+    """Configuration for KV event publishing."""
+    enable_kv_cache_events: bool = False
+    """If True, enable KV cache events for tracking block storage and removal.
+    Events can be published externally by zmq using the event publisher config.
+    """
+    publisher: Literal["null", "zmq"] = Field(default=None)
+    """The publisher to use for publishing kv events. Can be "null", "zmq".
+    """
+    endpoint: str = "tcp://*:5557"
+    """The zmq endpoint to use for publishing kv events.
+    """
+    replay_endpoint: str | None = None
+    """The zmq endpoint to use for replaying kv events.
+    """
+    buffer_steps: int = 10_000
+    """The number of steps to cache for replay endpoint. Will only save
+    events from the last N steps for the replay endpoint.
+    """
+    hwm: int = 100_000
+    """The zmq high water mark for the event publisher. After queueing N events,
+    events will start dropping if the consumer is not keeping up.
+    """
+    max_queue_size: int = 100_000
+    """The maximum number of events to queue while waiting for publishing.
+    """
+    topic: str = ""
+    """The topic to use for the event publisher. Consumers can subscribe to
+    this topic to receive events.
+    """
+    def __post_init__(self):
+        """Pick a publisher when the caller left `publisher` unset.
+        An unset publisher becomes "zmq" if `enable_kv_cache_events` is
+        truthy and "null" otherwise; an explicit publisher is kept as given.
+        """
+        if self.publisher is not None:
+            return
+        if self.enable_kv_cache_events:
+            self.publisher = "zmq"
+        else:
+            self.publisher = "null"

vllm/config/kv_transfer.py ADDED Viewed

@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import uuid
+from dataclasses import field
+from typing import Any, Literal, get_args
+from pydantic.dataclasses import dataclass
+from vllm.config.utils import config
+from vllm.utils.hashing import safe_hash
+KVProducer = Literal["kv_producer", "kv_both"]  # role values matched by is_kv_producer
+KVConsumer = Literal["kv_consumer", "kv_both"]  # role values matched by is_kv_consumer
+KVRole = Literal[KVProducer, KVConsumer]  # every value kv_role may take besides None
+@config
+@dataclass
+class KVTransferConfig:
+    """Configuration for distributed KV cache transfer."""
+    kv_connector: str | None = None
+    """The KV connector for vLLM to transmit KV caches between vLLM instances.
+    """
+    engine_id: str | None = None
+    """The engine id for KV transfers."""
+    kv_buffer_device: str = "cuda"
+    """The device used by kv connector to buffer the KV cache. Choices are
+    'cuda' and 'cpu'."""
+    kv_buffer_size: float = 1e9
+    """The buffer size for TorchDistributedConnector. Measured in number of
+    bytes. Recommended value: 1e9 (about 1GB)."""
+    kv_role: KVRole | None = None
+    """Whether this vLLM instance produces, consumes KV cache, or both. Choices
+    are 'kv_producer', 'kv_consumer', and 'kv_both'."""
+    kv_rank: int | None = None
+    """The rank of this vLLM instance in the KV cache transfer. Typical value:
+    0 for prefill instance, 1 for decode instance.
+    Currently only 1P1D is supported."""
+    kv_parallel_size: int = 1
+    """The number of parallel instances for KV cache transfer. For
+    P2pNcclConnector, this should be 2."""
+    kv_ip: str = "127.0.0.1"
+    """The KV connector ip, used to build distributed connection."""
+    kv_port: int = 14579
+    """The KV connector port, used to build distributed connection."""
+    kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
+    """any extra config that the connector may need."""
+    kv_connector_module_path: str | None = None
+    """The Python module path to dynamically load the KV connector from.
+    Only supported in V1."""
+    enable_permute_local_kv: bool = False
+    """Experiment feature flag to enable HND to NHD KV Transfer"""
+    def compute_hash(self) -> str:
+        """
+        Return the hex digest of the factors that affect the structure of
+        the computation graph from input ids/embeddings to the final
+        hidden states, excluding anything before the former and after
+        the latter.
+        WARNING: Whenever a new field is added to this config, include it
+        in the factors list if it affects the computation graph.
+        """
+        # Nothing in this config affects the computation graph, so the
+        # factor list is intentionally empty.
+        graph_factors: list[Any] = []
+        encoded = str(graph_factors).encode()
+        return safe_hash(encoded, usedforsecurity=False).hexdigest()
+    def __post_init__(self) -> None:
+        """Fill in a default engine id and validate the role settings.
+        Raises:
+            ValueError: If `kv_role` is set to a value outside `KVRole`,
+                or if `kv_connector` is set while `kv_role` is `None`.
+        """
+        if self.engine_id is None:
+            self.engine_id = str(uuid.uuid4())
+        if self.kv_role is None:
+            # A connector cannot be used without knowing the role.
+            if self.kv_connector is not None:
+                raise ValueError(
+                    "Please specify kv_role when kv_connector "
+                    f"is set, supported roles are {get_args(KVRole)}"
+                )
+            return
+        if self.kv_role not in get_args(KVRole):
+            raise ValueError(
+                f"Unsupported kv_role: {self.kv_role}. "
+                f"Supported roles are {get_args(KVRole)}"
+            )
+    @property
+    def is_kv_transfer_instance(self) -> bool:
+        """True when a connector is set and `kv_role` is any valid role."""
+        if self.kv_connector is None:
+            return False
+        return self.kv_role in get_args(KVRole)
+    @property
+    def is_kv_producer(self) -> bool:
+        """True when a connector is set and `kv_role` is a producer role."""
+        if self.kv_connector is None:
+            return False
+        return self.kv_role in get_args(KVProducer)
+    @property
+    def is_kv_consumer(self) -> bool:
+        """True when a connector is set and `kv_role` is a consumer role."""
+        if self.kv_connector is None:
+            return False
+        return self.kv_role in get_args(KVConsumer)
+    def get_from_extra_config(self, key, default) -> Any:
+        """Return `kv_connector_extra_config[key]`, or `default` if absent."""
+        extra_config = self.kv_connector_extra_config
+        return extra_config.get(key, default)

vllm/config/load.py ADDED Viewed

@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Any
+from pydantic import Field, field_validator
+from pydantic.dataclasses import dataclass
+from vllm.config.utils import config
+from vllm.logger import init_logger
+from vllm.utils.hashing import safe_hash
+if TYPE_CHECKING:
+    from vllm.model_executor.model_loader import LoadFormats
+    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+else:
+    LoadFormats = Any
+    TensorizerConfig = Any
+logger = init_logger(__name__)
+@config
+@dataclass
+class LoadConfig:
+    """Configuration for loading the model weights."""
+    load_format: str | LoadFormats = "auto"
+    """The format of the model weights to load:\n
+    - "auto" will try to load the weights in the safetensors format and fall
+    back to the pytorch bin format if safetensors format is not available.\n
+    - "pt" will load the weights in the pytorch bin format.\n
+    - "safetensors" will load the weights in the safetensors format.\n
+    - "npcache" will load the weights in pytorch format and store a numpy cache
+    to speed up the loading.\n
+    - "dummy" will initialize the weights with random values, which is mainly
+    for profiling.\n
+    - "tensorizer" will use CoreWeave's tensorizer library for fast weight
+    loading. See the Tensorize vLLM Model script in the Examples section for
+    more information.\n
+    - "runai_streamer" will load the Safetensors weights using Run:ai Model
+    Streamer.\n
+    - "runai_streamer_sharded" will load weights from pre-sharded checkpoint
+    files using Run:ai Model Streamer.\n
+    - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
+    - "sharded_state" will load weights from pre-sharded checkpoint files,
+    supporting efficient loading of tensor-parallel models.\n
+    - "gguf" will load weights from GGUF format files (details specified in
+    https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
+    - "mistral" will load weights from consolidated safetensors files used by
+    Mistral models.
+    - Other custom values can be supported via plugins."""
+    download_dir: str | None = None
+    """Directory to download and load the weights, default to the default
+    cache directory of Hugging Face."""
+    safetensors_load_strategy: str = "lazy"
+    """Specifies the loading strategy for safetensors weights.
+    - "lazy" (default): Weights are memory-mapped from the file. This enables
+      on-demand loading and is highly efficient for models on local storage.
+    - "eager": The entire file is read into CPU memory upfront before loading.
+      This is recommended for models on network filesystems (e.g., Lustre, NFS)
+      as it avoids inefficient random reads, significantly speeding up model
+      initialization. However, it uses more CPU RAM.
+    - "torchao": Weights are loaded in upfront and then reconstructed
+      into torchao tensor subclasses. This is used when the checkpoint
+      was quantized using torchao and saved using safetensors.
+      Needs torchao >= 0.14.0
+    """
+    model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
+    """Extra config for model loader. This will be passed to the model loader
+    corresponding to the chosen load_format."""
+    device: str | None = None
+    """Device to which model weights will be loaded, default to
+    device_config.device"""
+    ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
+    """The list of patterns to ignore when loading the model. Default to
+    "original/**/*" to avoid repeated loading of llama's checkpoints."""
+    use_tqdm_on_load: bool = True
+    """Whether to enable tqdm for showing progress bar when loading model
+    weights."""
+    pt_load_map_location: str | dict[str, str] = "cpu"
+    """
+    pt_load_map_location: the map location for loading pytorch checkpoint, to
+    support loading checkpoints can only be loaded on certain devices like
+    "cuda", this is equivalent to {"": "cuda"}. Another supported format is
+    mapping from different devices like from GPU 1 to GPU 0:
+    {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
+    in dictionary needs to be double quoted for json parsing. For more details,
+    see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
+    """
+    def compute_hash(self) -> str:
+        """
+        Return the hex digest of the factors that affect the structure of
+        the computation graph from input ids/embeddings to the final
+        hidden states, excluding anything before the former and after
+        the latter.
+        WARNING: Whenever a new field is added to this config, include it
+        in the factors list if it affects the computation graph.
+        """
+        # Nothing in this config affects the computation graph, so the
+        # factor list is intentionally empty.
+        graph_factors: list[Any] = []
+        encoded = str(graph_factors).encode()
+        return safe_hash(encoded, usedforsecurity=False).hexdigest()
+    @field_validator("load_format", mode="after")
+    def _lowercase_load_format(cls, load_format: str) -> str:
+        """Return `load_format` converted to lower case."""
+        lowered = load_format.lower()
+        return lowered
+    @field_validator("ignore_patterns", mode="after")
+    def _validate_ignore_patterns(
+        cls, ignore_patterns: list[str] | str
+    ) -> list[str] | str:
+        """Log non-default, non-empty ignore patterns; return them unchanged."""
+        is_default = ignore_patterns == ["original/**/*"]
+        if not is_default and len(ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                ignore_patterns,
+            )
+        return ignore_patterns

vllm/config/lora.py ADDED Viewed

@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Any, Literal
+import torch
+from pydantic import ConfigDict, Field, model_validator
+from pydantic.dataclasses import dataclass
+from typing_extensions import Self
+from vllm.config.utils import config
+from vllm.logger import init_logger
+from vllm.utils.hashing import safe_hash
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.config.cache import CacheConfig
+else:
+    ModelConfig = Any
+    CacheConfig = Any
+logger = init_logger(__name__)
+LoRADType = Literal["auto", "float16", "bfloat16"]  # string forms accepted for LoRAConfig.lora_dtype
+MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]  # values accepted for LoRAConfig.max_lora_rank
+LoRAExtraVocabSize = Literal[256, 512]  # NOTE(review): not referenced in the visible part of this file
+@config
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
+    """Configuration for LoRA."""
+    max_lora_rank: MaxLoRARanks = 16
+    """Max LoRA rank."""
+    max_loras: int = Field(default=1, ge=1)
+    """Max number of LoRAs in a single batch."""
+    fully_sharded_loras: bool = False
+    """By default, only half of the LoRA computation is sharded with tensor
+    parallelism. Enabling this will use the fully sharded layers. At high
+    sequence length, max rank or tensor parallel size, this is likely faster.
+    """
+    max_cpu_loras: int | None = None
+    """Maximum number of LoRAs to store in CPU memory. Must be >= than
+    `max_loras`."""
+    lora_dtype: torch.dtype | LoRADType = "auto"
+    """Data type for LoRA. If auto, will default to base model dtype."""
+    default_mm_loras: dict[str, str] | None = None
+    """Dictionary mapping specific modalities to LoRA model paths; this field
+    is only applicable to multimodal models and should be leveraged when a
+    model always expects a LoRA to be active when a given modality is present.
+    Note that currently, if a request provides multiple additional
+    modalities, each of which have their own LoRA, we do NOT apply
+    default_mm_loras because we currently only support one lora adapter
+    per prompt. When run in offline mode, the lora IDs for n modalities
+    will be automatically assigned to 1-n with the names of the modalities
+    in alphabetic order."""
+    def compute_hash(self) -> str:
+        """
+        Return the hex digest of the factors that affect the structure of
+        the computation graph from input ids/embeddings to the final
+        hidden states, excluding anything before the former and after
+        the latter.
+        WARNING: Whenever a new field is added to this config, include it
+        in the factors list if it affects the computation graph.
+        """
+        # Order matters: the digest is taken over the list's string form.
+        graph_factors: list[Any] = [
+            self.max_lora_rank,
+            self.max_loras,
+            self.fully_sharded_loras,
+            self.lora_dtype,
+        ]
+        encoded = str(graph_factors).encode()
+        return safe_hash(encoded, usedforsecurity=False).hexdigest()
+    @model_validator(mode="after")
+    def _validate_lora_config(self) -> Self:
+        """Default `max_cpu_loras` to `max_loras` and enforce its lower bound.
+        Raises:
+            ValueError: If `max_cpu_loras` is set and smaller than
+                `max_loras`.
+        """
+        if self.max_cpu_loras is None:
+            # Unset: keep as many LoRAs on CPU as fit in one batch.
+            self.max_cpu_loras = self.max_loras
+            return self
+        if self.max_cpu_loras < self.max_loras:
+            raise ValueError(
+                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
+                f"max_loras ({self.max_loras})"
+            )
+        return self
+    def verify_with_model_config(self, model_config: ModelConfig):
+        if self.lora_dtype in (None, "auto"):
+            self.lora_dtype = model_config.dtype
+        elif isinstance(self.lora_dtype, str):
+            self.lora_dtype = getattr(torch, self.lora_dtype)