PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/utils/profiling.py ADDED Viewed

@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+import contextlib
+from collections.abc import Callable
+from functools import wraps
+from typing import Any
+@contextlib.contextmanager
+def cprofile_context(save_file: str | None = None):
+    """Profile the body of a ``with`` block using cProfile.
+    Args:
+        save_file: File to dump the collected stats into. When it is
+            None, empty, or the string "1", the stats are printed
+            (sorted by cumulative time) instead of being saved.
+    """
+    import cProfile
+    profiler = cProfile.Profile()
+    profiler.enable()
+    try:
+        yield
+    finally:
+        # Runs even when the profiled body raised: stop collecting first,
+        # then report.
+        profiler.disable()
+        print_only = not save_file or save_file == "1"
+        if print_only:
+            profiler.print_stats(sort="cumtime")
+        else:
+            profiler.dump_stats(save_file)
+def cprofile(save_file: str | None = None, enabled: bool = True):
+    """Build a decorator that runs the wrapped callable under cProfile.
+    Args:
+        save_file: Where to store the profile result. If it is "1",
+            None, or "", the result is printed to stdout instead.
+        enabled: When false, the wrapped callable is invoked directly
+            without any profiling.
+    """
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args: Any, **kwargs: Any):
+            if enabled:
+                with cprofile_context(save_file):
+                    return func(*args, **kwargs)
+            # Profiling is switched off: plain pass-through call.
+            return func(*args, **kwargs)
+        return wrapper
+    return decorator

vllm/utils/registry.py ADDED Viewed

@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any
+class ExtensionManager:
+    """
+    A registry for managing pluggable extension classes.
+    This class provides a simple mechanism to register and instantiate
+    extension classes by name. It is commonly used to implement plugin
+    systems where different implementations can be swapped at runtime.
+    Examples:
+        Basic usage with a registry instance:
+        >>> FOO_REGISTRY = ExtensionManager()
+        >>> @FOO_REGISTRY.register("my_foo_impl")
+        ... class MyFooImpl(Foo):
+        ...     def __init__(self, value):
+        ...         self.value = value
+        >>> foo_impl = FOO_REGISTRY.load("my_foo_impl", value=123)
+    """
+    def __init__(self) -> None:
+        """
+        Initialize an empty extension registry.
+        """
+        # Maps the registered name to the class registered under it.
+        self.name2class: dict[str, type] = {}
+    def register(self, name: str):
+        """
+        Decorator to register a class with the given name.
+        Registering a second class under an existing name replaces the
+        earlier entry. The decorated class is returned unchanged.
+        """
+        def wrap(cls_to_register):
+            self.name2class[name] = cls_to_register
+            return cls_to_register
+        return wrap
+    def load(self, cls_name: str, *args, **kwargs) -> Any:
+        """
+        Instantiate and return a registered extension class by name.
+        Args:
+            cls_name: Name the class was registered under.
+            *args: Positional arguments forwarded to the class constructor.
+            **kwargs: Keyword arguments forwarded to the class constructor.
+        Raises:
+            AssertionError: If no class is registered under ``cls_name``.
+        """
+        cls = self.name2class.get(cls_name)
+        if cls is None:
+            # Raised explicitly (rather than via an `assert` statement) so
+            # the check is not stripped under `python -O`, while keeping
+            # the exception type unchanged for existing callers.
+            raise AssertionError(f"Extension class {cls_name} not found")
+        return cls(*args, **kwargs)

vllm/utils/serial_utils.py ADDED Viewed

@@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import base64
+import sys
+from dataclasses import dataclass
+from typing import Literal
+import numpy as np
+import torch
+from typing_extensions import assert_never
+from vllm import PoolingRequestOutput
+# Byte order of the running interpreter's host; compared against the
+# requested endianness to decide whether a byteswap is needed.
+sys_byteorder = sys.byteorder
+# Maps an embed dtype name to the torch dtype tensors are converted to.
+EMBED_DTYPE_TO_TORCH_DTYPE = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "bfloat16": torch.bfloat16,
+    # NOTE: it is not known whether CPUs on every platform support the
+    # fp8 data formats. The fp8 types are used here only as a data
+    # representation (no fp8 computation is performed), and only on the
+    # CPU, but this may still break on some platforms.
+    "fp8_e4m3": torch.float8_e4m3fn,
+    "fp8_e5m2": torch.float8_e5m2,
+}
+# Torch dtype used to reinterpret (view) the converted tensor before it is
+# handed to numpy.
+EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    # numpy does not support bfloat16 and fp8, so those are viewed through
+    # stand-in dtypes (float16 / uint8) instead.
+    "bfloat16": torch.float16,
+    "fp8_e4m3": torch.uint8,
+    "fp8_e5m2": torch.uint8,
+}
+# Numpy counterpart of the view table above, used when reading raw bytes.
+EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW = {
+    "float32": np.float32,
+    "float16": np.float16,
+    # numpy does not support bfloat16 and fp8, so those are read through
+    # stand-in dtypes (float16 / uint8) instead.
+    "bfloat16": np.float16,
+    "fp8_e4m3": np.uint8,
+    "fp8_e5m2": np.uint8,
+}
+# Runtime list of accepted endianness values (mirrors the Literal below).
+ENDIANNESS = ["native", "big", "little"]
+# Static typing aliases for the string options accepted in this module.
+EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"]
+Endianness = Literal["native", "big", "little"]
+EncodingFormat = Literal["float", "base64", "bytes"]
+def tensor2binary(
+    tensor: torch.Tensor, embed_dtype: EmbedDType, endianness: Endianness
+) -> bytes:
+    """Serialize a tensor into raw bytes.
+    The tensor is converted to the torch dtype registered for
+    ``embed_dtype``, flattened, reinterpreted through the registered view
+    dtype so that numpy can hold it, and byteswapped when ``endianness``
+    is neither "native" nor the host byte order.
+    Args:
+        tensor: Tensor to serialize.
+        embed_dtype: Name of the target dtype.
+        endianness: Requested byte order of the result.
+    Returns:
+        The flattened tensor contents as bytes.
+    """
+    assert isinstance(tensor, torch.Tensor)
+    assert embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE
+    assert endianness in ENDIANNESS
+    target_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype]
+    view_dtype = EMBED_DTYPE_TO_TORCH_DTYPE_VIEW[embed_dtype]
+    flat = tensor.to(target_dtype).flatten().contiguous()
+    np_array = flat.view(view_dtype).numpy()
+    needs_swap = endianness not in ("native", sys_byteorder)
+    if needs_swap:
+        np_array = np_array.byteswap()
+    return np_array.tobytes()
+def binary2tensor(
+    binary: bytes,
+    shape: tuple[int, ...],
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+) -> torch.Tensor:
+    """Rebuild a tensor from raw bytes.
+    Args:
+        binary: Raw bytes holding the tensor contents.
+        shape: Shape the decoded data is reshaped to.
+        embed_dtype: Name of the dtype the bytes are stored in.
+        endianness: Byte order the bytes are stored in; a byteswap is
+            applied when it is neither "native" nor the host byte order.
+    Returns:
+        A tensor of the torch dtype registered for ``embed_dtype``.
+    """
+    assert embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE
+    assert embed_dtype in EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW
+    assert endianness in ENDIANNESS
+    torch_dtype = EMBED_DTYPE_TO_TORCH_DTYPE[embed_dtype]
+    np_dtype = EMBED_DTYPE_TO_NUMPY_DTYPE_VIEW[embed_dtype]
+    np_array = np.frombuffer(binary, dtype=np_dtype).reshape(shape)
+    if endianness != "native" and endianness != sys_byteorder:
+        np_array = np_array.byteswap()
+    if not np_array.flags.writeable:
+        # An array built by np.frombuffer over immutable bytes is
+        # read-only; torch.from_numpy warns on such arrays and writes to
+        # the resulting tensor are undefined behaviour. Copy so the tensor
+        # owns writable memory (byteswap() above already yields a copy).
+        np_array = np_array.copy()
+    return torch.from_numpy(np_array).view(torch_dtype)
+def encode_pooling_output(
+    output: PoolingRequestOutput,
+    encoding_format: EncodingFormat,
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+) -> list[float] | str | bytes:
+    """Encode one pooling output in the requested format.
+    Args:
+        output: Pooling result whose ``outputs.data`` is encoded.
+        encoding_format: "float" returns ``data.tolist()``, "bytes"
+            returns the raw serialized bytes, and "base64" returns those
+            bytes base64-encoded as a UTF-8 string.
+        embed_dtype: Dtype name used for the binary formats.
+        endianness: Byte order used for the binary formats.
+    """
+    if encoding_format == "float":
+        return output.outputs.data.tolist()
+    if encoding_format == "bytes":
+        return tensor2binary(output.outputs.data, embed_dtype, endianness)
+    if encoding_format == "base64":
+        raw = tensor2binary(output.outputs.data, embed_dtype, endianness)
+        return base64.b64encode(raw).decode("utf-8")
+    # Unreachable for valid inputs; lets type checkers verify exhaustiveness.
+    assert_never(encoding_format)
+@dataclass
+class MetadataItem:
+    """Describes where one encoded tensor lives inside a binary body."""
+    # Position of the tensor in the original sequence of outputs.
+    index: int
+    # Name of the dtype the tensor's bytes are stored in.
+    embed_dtype: EmbedDType
+    # Byte order the tensor's bytes are stored in.
+    endianness: Endianness
+    # Offset into the body at which this tensor's bytes begin.
+    start: int
+    # Offset into the body at which this tensor's bytes end (exclusive).
+    end: int
+    # Shape to restore when the bytes are decoded.
+    shape: tuple[int, ...]
+def encode_pooling_bytes(
+    pooling_outputs: list[PoolingRequestOutput],
+    embed_dtype: EmbedDType,
+    endianness: Endianness,
+) -> tuple[list[bytes], list[dict[str, object]], dict[str, int]]:
+    """Serialize several pooling outputs into binary chunks plus metadata.
+    Args:
+        pooling_outputs: Outputs whose ``outputs.data`` tensors are encoded.
+        embed_dtype: Dtype name the tensors are stored in.
+        endianness: Byte order the tensors are stored in.
+    Returns:
+        A tuple ``(body, items, usage)``:
+        ``body`` is the list of per-output byte chunks, in input order;
+        ``items`` holds one metadata dict per output, whose ``start`` and
+        ``end`` are offsets into the concatenation of ``body``;
+        ``usage`` reports the summed prompt token count under both
+        ``prompt_tokens`` and ``total_tokens``.
+    """
+    num_prompt_tokens = 0
+    # Entries are plain dicts of mixed values (ints, strings, a shape),
+    # not MetadataItem instances.
+    items: list[dict[str, object]] = []
+    body: list[bytes] = []
+    offset = 0
+    for idx, output in enumerate(pooling_outputs):
+        data = output.outputs.data
+        binary = tensor2binary(
+            tensor=data,
+            embed_dtype=embed_dtype,
+            endianness=endianness,
+        )
+        size = len(binary)
+        item = {
+            "index": idx,
+            "embed_dtype": embed_dtype,
+            "endianness": endianness,
+            "start": offset,
+            "end": offset + size,
+            "shape": data.shape,
+        }
+        body.append(binary)
+        items.append(item)
+        num_prompt_tokens += len(output.prompt_token_ids)
+        offset += size
+    usage = {
+        "prompt_tokens": num_prompt_tokens,
+        "total_tokens": num_prompt_tokens,
+    }
+    return body, items, usage
+def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch.Tensor]:
+    """Decode the tensors described by ``items`` out of ``body``.
+    Args:
+        items: Metadata entries giving each tensor's byte range, shape,
+            dtype name and endianness. The list is not modified.
+        body: Binary payload that the ``start``/``end`` offsets index into.
+    Returns:
+        The decoded tensors, ordered by ascending ``index``.
+    """
+    # sorted() rather than list.sort() so the caller's list is not
+    # reordered as a side effect.
+    ordered = sorted(items, key=lambda item: item.index)
+    return [
+        binary2tensor(
+            body[item.start : item.end],
+            item.shape,
+            item.embed_dtype,
+            item.endianness,
+        )
+        for item in ordered
+    ]

vllm/utils/system_utils.py ADDED Viewed

@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+import contextlib
+import multiprocessing
+import os
+import signal
+import sys
+from collections.abc import Callable, Iterator
+from pathlib import Path
+from typing import TextIO
+import psutil
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.ray.lazy_utils import is_in_ray_actor
+from .platform_utils import cuda_is_initialized, xpu_is_initialized
+# Module-level logger.
+logger = init_logger(__name__)
+# ANSI escape sequences used to colour the log prefix: bold cyan, and reset.
+CYAN = "\033[1;36m"
+RESET = "\033[0;0m"
+# Environment variable utilities
+def update_environment_variables(envs_dict: dict[str, str]):
+    """Apply every entry of ``envs_dict`` to ``os.environ``.
+    A warning is logged whenever a variable that is already set is about
+    to be replaced by a different value.
+    """
+    for name, new_value in envs_dict.items():
+        current = os.environ.get(name)
+        if current is not None and current != new_value:
+            logger.warning(
+                "Overwriting environment variable %s from '%s' to '%s'",
+                name,
+                current,
+                new_value,
+            )
+        os.environ[name] = new_value
+@contextlib.contextmanager
+def set_env_var(key: str, value: str) -> Iterator[None]:
+    """Set ``key`` to ``value`` in ``os.environ`` for the ``with`` block.
+    On exit (including when the body raises) the previous value is put
+    back, or the variable is removed if it was not set beforehand.
+    """
+    previous = os.environ.get(key)
+    os.environ[key] = value
+    try:
+        yield
+    finally:
+        if previous is not None:
+            os.environ[key] = previous
+        else:
+            os.environ.pop(key, None)
+# File path utilities
+def unique_filepath(fn: Callable[[int], Path]) -> Path:
+    """Return the first path produced by ``fn(0)``, ``fn(1)``, ... that
+    does not exist yet.
+    Note: This function has a TOCTOU race condition.
+    Caller should use atomic operations (e.g., open with 'x' mode)
+    when creating the file to ensure thread safety.
+    """
+    attempt = 0
+    while (candidate := fn(attempt)).exists():
+        attempt += 1
+    return candidate
+# Process management utilities
+def _maybe_force_spawn():
+    """Switch VLLM_WORKER_MULTIPROC_METHOD to `spawn` when required.
+    Does nothing if the environment variable is already "spawn".
+    Otherwise the variable is overridden (with a warning listing the
+    reasons) when running inside a Ray actor or when CUDA/XPU has already
+    been initialized.
+    """
+    if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
+        return
+    spawn_reasons = []
+    if is_in_ray_actor():
+        # even if we choose to spawn, we need to pass the ray address
+        # to the subprocess so that it knows how to connect to the ray cluster.
+        # env vars are inherited by subprocesses, even if we use spawn.
+        import ray
+        os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address
+        spawn_reasons.append("In a Ray actor and can only be spawned")
+    if cuda_is_initialized():
+        spawn_reasons.append("CUDA is initialized")
+    elif xpu_is_initialized():
+        spawn_reasons.append("XPU is initialized")
+    if not spawn_reasons:
+        return
+    logger.warning(
+        "We must use the `spawn` multiprocessing start method. "
+        "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+        "See https://docs.vllm.ai/en/latest/usage/"
+        "troubleshooting.html#python-multiprocessing "
+        "for more information. Reasons: %s",
+        "; ".join(spawn_reasons),
+    )
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+def get_mp_context():
+    """Return the multiprocessing context vLLM workers should use.
+    The start method normally follows VLLM_WORKER_MULTIPROC_METHOD
+    (default is fork). Before it is read, `_maybe_force_spawn()` is given
+    the chance to override that variable to `spawn` when conditions
+    require it.
+    """
+    _maybe_force_spawn()
+    return multiprocessing.get_context(envs.VLLM_WORKER_MULTIPROC_METHOD)
+def set_process_title(
+    name: str,
+    suffix: str = "",
+    prefix: str = envs.VLLM_PROCESS_NAME_PREFIX,
+) -> None:
+    """Set the current process title to ``"{prefix}::{name}"``.
+    When ``suffix`` is non-empty it is appended to ``name`` with an
+    underscore. The call is a silent no-op if `setproctitle` is not
+    installed.
+    """
+    try:
+        import setproctitle
+    except ImportError:
+        return
+    title = f"{name}_{suffix}" if suffix else name
+    setproctitle.setproctitle(f"{prefix}::{title}")
+def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
+    """Add colored prefix to file output for log decoration.
+    Replaces ``file.write`` with a wrapper that emits
+    ``(worker_name pid=pid)`` in colour at the start of every line. The
+    ``file.start_new_line`` attribute tracks, across calls, whether the
+    next write begins a fresh line.
+    Args:
+        file: Stream whose ``write`` method is replaced in place.
+        worker_name: Name shown inside the prefix.
+        pid: Process id shown inside the prefix.
+    """
+    prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
+    # Keep the original bound method; the wrapper below delegates to it.
+    file_write = file.write
+    def write_with_prefix(s: str) -> int:
+        # Returns the number of characters of `s` consumed, as the
+        # TextIOBase.write contract requires (prefix characters are not
+        # counted), instead of implicitly returning None.
+        if not s:
+            return 0
+        if file.start_new_line:  # type: ignore[attr-defined]
+            file_write(prefix)
+        idx = 0
+        while (next_idx := s.find("\n", idx)) != -1:
+            # Write up to and including the newline.
+            next_idx += 1
+            file_write(s[idx:next_idx])
+            if next_idx == len(s):
+                # `s` ended exactly on a newline: defer the prefix until
+                # the next write so no dangling prefix is emitted.
+                file.start_new_line = True  # type: ignore[attr-defined]
+                return len(s)
+            file_write(prefix)
+            idx = next_idx
+        # Trailing text without a newline: the current line stays open.
+        file_write(s[idx:])
+        file.start_new_line = False  # type: ignore[attr-defined]
+        return len(s)
+    file.start_new_line = True  # type: ignore[attr-defined]
+    file.write = write_with_prefix  # type: ignore[method-assign]
+def decorate_logs(process_name: str | None = None) -> None:
+    """Prefix everything written to stdout/stderr with the process name
+    and PID.
+    Args:
+        process_name: Name to show in the prefix. When None, the name of
+            the current process from the multiprocessing context is used.
+    """
+    if process_name is None:
+        process_name = get_mp_context().current_process().name
+    current_pid = os.getpid()
+    for stream in (sys.stdout, sys.stderr):
+        _add_prefix(stream, process_name, current_pid)
+def kill_process_tree(pid: int):
+    """
+    Kills the given process and all of its descendants by sending SIGKILL.
+    Descendants are signalled first, then the process itself. Processes
+    that no longer exist are skipped silently.
+    Args:
+        pid (int): Process ID of the parent process
+    """
+    try:
+        root = psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return
+    # Snapshot every descendant (children, grandchildren, ...) up front.
+    descendant_pids = [proc.pid for proc in root.children(recursive=True)]
+    # Descendants go first; the root pid is signalled last.
+    for target_pid in (*descendant_pids, pid):
+        with contextlib.suppress(ProcessLookupError):
+            os.kill(target_pid, signal.SIGKILL)
+# Resource utilities
+# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630
+def set_ulimit(target_soft_limit: int = 65535):
+    """Raise the soft limit on open file descriptors if it is too low.
+    Nothing is changed when the current soft limit already meets
+    ``target_soft_limit`` or when running on Windows. If raising the
+    limit fails with ValueError, a warning is logged instead of raising.
+    Args:
+        target_soft_limit: Minimum soft RLIMIT_NOFILE value to aim for.
+    """
+    if sys.platform.startswith("win"):
+        logger.info("Windows detected, skipping ulimit adjustment.")
+        return
+    import resource
+    nofile = resource.RLIMIT_NOFILE
+    soft_limit, hard_limit = resource.getrlimit(nofile)
+    if soft_limit >= target_soft_limit:
+        return
+    try:
+        # Only the soft limit is raised; the hard limit is left untouched.
+        resource.setrlimit(nofile, (target_soft_limit, hard_limit))
+    except ValueError as err:
+        logger.warning(
+            "Found ulimit of %s and failed to automatically increase "
+            "with error %s. This can cause fd limit errors like "
+            "`OSError: [Errno 24] Too many open files`. Consider "
+            "increasing with ulimit -n",
+            soft_limit,
+            err,
+        )