PyPI - vllm-cpu-avx512vnni - Versions diffs - 0.10.2.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-avx512vnni 0.10.2.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu-avx512vnni might be problematic. Click here for more details.

Files changed (1395) hide show

vllm/inputs/registry.py ADDED Viewed

@@ -0,0 +1,251 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+import torch
+from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
+from typing_extensions import TypeVar
+from vllm.logger import init_logger
+from vllm.transformers_utils.processor import cached_processor_from_config
+from vllm.utils import get_allowed_kwarg_only_overrides
+from vllm.utils.jsontree import JSONTree, json_map_leaves
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
+                                 MultiModalRegistry)
+    from vllm.sequence import SequenceData
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    ModelConfig = Any
+    MultiModalDataDict = Any
+    MultiModalPlaceholderDict = Any
+    MultiModalRegistry = Any
+    SequenceData = Any
+    AnyTokenizer = Any
+_T = TypeVar("_T")
+_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
+_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+logger = init_logger(__name__)
+@dataclass(frozen=True)
+class InputContext:
+    """
+    Contains information about the model which may be used to
+    modify the inputs.
+    """
+    model_config: ModelConfig
+    """The configuration of the model."""
+    def get_hf_config(
+        self,
+        typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
+        /,
+    ) -> _C:
+        """
+        Get the HuggingFace configuration
+        (`transformers.PretrainedConfig`) of the model,
+        additionally checking its type.
+        Raises:
+            TypeError: If the configuration is not of the specified type.
+        """
+        hf_config = self.model_config.hf_config
+        if not isinstance(hf_config, typ):
+            raise TypeError("Invalid type of HuggingFace config. "
+                            f"Expected type: {typ}, but "
+                            f"found type: {type(hf_config)}")
+        return hf_config
+    def get_hf_image_processor_config(self) -> dict[str, Any]:
+        """
+        Get the HuggingFace image processor configuration of the model.
+        """
+        return self.model_config.hf_image_processor_config
+    def get_mm_config(self):
+        """
+        Get the multimodal config of the model.
+        Raises:
+            RuntimeError: If the model is not a multimodal model.
+        """
+        mm_config = self.model_config.multimodal_config
+        if mm_config is None:
+            raise RuntimeError("Not a multimodal model")
+        return mm_config
+    def get_hf_processor(
+        self,
+        typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+        /,
+        **kwargs: object,
+    ) -> _P:
+        """
+        Get the HuggingFace processor
+        (`transformers.ProcessorMixin`) of the model,
+        additionally checking its type.
+        Raises:
+            TypeError: If the processor is not of the specified type.
+        """
+        return cached_processor_from_config(
+            self.model_config,
+            processor_cls=typ,
+            **kwargs,
+        )
+    def init_processor(
+        self,
+        typ: type[_T],
+        /,
+        **kwargs: object,
+    ) -> _T:
+        """
+        Initialize a HuggingFace-like processor class, merging the
+        keyword arguments with those in the model's configuration.
+        """
+        mm_config = self.model_config.get_multimodal_config()
+        base_kwargs = mm_config.mm_processor_kwargs
+        if base_kwargs is None:
+            base_kwargs = {}
+        merged_kwargs = {**base_kwargs, **kwargs}
+        return typ(**merged_kwargs)
+@dataclass(frozen=True)
+class InputProcessingContext(InputContext):
+    tokenizer: AnyTokenizer
+    """The tokenizer used to tokenize the inputs."""
+    def get_hf_processor(
+        self,
+        typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+        /,
+        **kwargs: object,
+    ) -> _P:
+        return super().get_hf_processor(
+            typ,
+            tokenizer=self.tokenizer,
+            **kwargs,
+        )
+    def call_hf_processor(
+        self,
+        hf_processor: ProcessorMixin,
+        data: Mapping[str, object],
+        kwargs: Mapping[str, object] = {},
+    ) -> Union[BatchFeature, JSONTree]:
+        """
+        Call `hf_processor` on the prompt `data`
+        (text, image, audio...) with configurable options `kwargs`.
+        """
+        assert callable(hf_processor)
+        mm_config = self.model_config.get_multimodal_config()
+        merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
+        allowed_kwargs = get_allowed_kwarg_only_overrides(
+            hf_processor,
+            merged_kwargs,
+            requires_kw_only=False,
+            allow_var_kwargs=True,
+        )
+        def maybe_cast_dtype(x):
+            # This mimics the behavior of transformers.BatchFeature
+            if isinstance(x, torch.Tensor) and x.is_floating_point():
+                return x.to(dtype=self.model_config.dtype)
+            return x
+        try:
+            output = hf_processor(**data,
+                                  **allowed_kwargs,
+                                  return_tensors="pt")
+            # this emulates output.to(dtype=self.model_config.dtype)
+            if isinstance(output, BatchFeature):
+                cast_output = json_map_leaves(maybe_cast_dtype, output.data)
+                return BatchFeature(cast_output)
+            cast_output = json_map_leaves(maybe_cast_dtype, output)
+            logger.warning_once(
+                f"{type(hf_processor).__name__} did not return `BatchFeature`. "
+                "Make sure to match the behaviour of `ProcessorMixin` when "
+                "implementing custom processors.")
+            return cast_output
+        except Exception as exc:
+            msg = (f"Failed to apply {type(hf_processor).__name__} "
+                   f"on data={data} with kwargs={allowed_kwargs}")
+            raise ValueError(msg) from exc
+class DummyData(NamedTuple):
+    """
+    Dummy data used for profiling.
+    Note: This is only used in V0.
+    """
+    # Token sequence data; the only field without a default.
+    seq_data: SequenceData
+    # Optional multi-modal inputs accompanying `seq_data`; None by default.
+    multi_modal_data: Optional[MultiModalDataDict] = None
+    # Optional placeholder dict for the multi-modal inputs; None by default.
+    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
+class InputRegistry:
+    """
+    Note: This is only used in V0.
+    """
+    def dummy_data_for_profiling(
+        self,
+        model_config: ModelConfig,
+        seq_len: int,
+        mm_registry: MultiModalRegistry,
+        is_encoder_data: bool = False,
+    ) -> DummyData:
+        """
+        Create dummy data for profiling the memory usage of a model.
+        The model is identified by ``model_config``.
+        """
+        # Avoid circular import
+        from vllm.multimodal.cache import processor_only_cache_from_config
+        from vllm.sequence import SequenceData
+        if not model_config.is_multimodal_model:
+            seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
+            return DummyData(seq_data=seq_data)
+        cache = processor_only_cache_from_config(model_config, mm_registry)
+        # Encoder dummy data does not contain multi-modal data
+        if is_encoder_data:
+            enc_data = mm_registry.get_encoder_dummy_data(model_config,
+                                                          seq_len,
+                                                          cache=cache)
+            seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids)
+            return DummyData(seq_data=seq_data)
+        dec_data = mm_registry.get_decoder_dummy_data(model_config,
+                                                      seq_len,
+                                                      cache=cache)
+        return DummyData(
+            seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids),
+            multi_modal_data=dec_data.multi_modal_data.get_data(),
+            multi_modal_placeholders=dec_data.multi_modal_placeholders,
+        )

vllm/logger.py ADDED Viewed

@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Logging configuration for vLLM."""
+import datetime
+import json
+import logging
+import os
+import sys
+from collections.abc import Hashable
+from functools import lru_cache, partial
+from logging import Logger
+from logging.config import dictConfig
+from os import path
+from types import MethodType
+from typing import Any, Optional, cast
+import vllm.envs as envs
+# Snapshot the logging-related settings from `vllm.envs` at import time.
+VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
+VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
+VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
+VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX
+VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM
+# Record layout: <prefix><level> <timestamp> [<fileinfo>:<lineno>] <message>.
+# `fileinfo` is not a standard LogRecord attribute, so the formatter in use
+# has to set it on the record before this format string is applied.
+_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
+           "[%(fileinfo)s:%(lineno)d] %(message)s")
+# Timestamp layout: month-day hour:minute:second (no year).
+_DATE_FORMAT = "%m-%d %H:%M:%S"
+# `logging.config.dictConfig` schema used when VLLM_CONFIGURE_LOGGING is set
+# and no custom config file is supplied.
+DEFAULT_LOGGING_CONFIG = {
+    "formatters": {
+        "vllm": {
+            "class": "vllm.logging_utils.NewLineFormatter",
+            "datefmt": _DATE_FORMAT,
+            "format": _FORMAT,
+        },
+    },
+    "handlers": {
+        "vllm": {
+            "class": "logging.StreamHandler",
+            "formatter": "vllm",
+            # The effective verbosity is decided on the handler ...
+            "level": VLLM_LOGGING_LEVEL,
+            "stream": VLLM_LOGGING_STREAM,
+        },
+    },
+    "loggers": {
+        "vllm": {
+            "handlers": ["vllm"],
+            # ... while the logger itself lets every level through.
+            "level": "DEBUG",
+            # Records are not passed on to ancestor loggers.
+            "propagate": False,
+        },
+    },
+    "version": 1,
+    # Loggers created before this config is applied stay enabled.
+    "disable_existing_loggers": False
+}
+@lru_cache
+def _print_debug_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    """
+    Log `msg` (with %-style `args`) at DEBUG level through `logger`.
+    The function is wrapped in `lru_cache`, so a repeated call with the
+    same `(logger, msg, *args)` that is still cached does not log again.
+    All arguments must therefore be hashable.
+    """
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.debug(msg, *args, stacklevel=2)
+@lru_cache
+def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    """
+    Log `msg` (with %-style `args`) at INFO level through `logger`.
+    The function is wrapped in `lru_cache`, so a repeated call with the
+    same `(logger, msg, *args)` that is still cached does not log again.
+    All arguments must therefore be hashable.
+    """
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.info(msg, *args, stacklevel=2)
+@lru_cache
+def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    """
+    Log `msg` (with %-style `args`) at WARNING level through `logger`.
+    The function is wrapped in `lru_cache`, so a repeated call with the
+    same `(logger, msg, *args)` that is still cached does not log again.
+    All arguments must therefore be hashable.
+    """
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.warning(msg, *args, stacklevel=2)
+class _VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the [`logging.Logger`][]
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+    """
+    def debug_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`debug`][logging.Logger.debug], but subsequent calls with
+        the same message and arguments are silently dropped while the
+        first call is still held in the underlying `lru_cache`.
+        """
+        _print_debug_once(self, msg, *args)
+    def info_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`info`][logging.Logger.info], but subsequent calls with
+        the same message and arguments are silently dropped while the
+        first call is still held in the underlying `lru_cache`.
+        """
+        _print_info_once(self, msg, *args)
+    def warning_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`warning`][logging.Logger.warning], but subsequent calls with
+        the same message and arguments are silently dropped while the
+        first call is still held in the underlying `lru_cache`.
+        """
+        _print_warning_once(self, msg, *args)
+# Pre-defined methods mapping to avoid repeated dictionary creation.
+# Keys are the attribute names added to a logger instance; values are the
+# cached module-level functions that get bound to that instance.
+_METHODS_TO_PATCH = {
+    "debug_once": _print_debug_once,
+    "info_once": _print_info_once,
+    "warning_once": _print_warning_once,
+}
+def _configure_vllm_root_logger() -> None:
+    logging_config = dict[str, Any]()
+    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
+        raise RuntimeError(
+            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
+            "implies VLLM_CONFIGURE_LOGGING. Please enable "
+            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")
+    if VLLM_CONFIGURE_LOGGING:
+        logging_config = DEFAULT_LOGGING_CONFIG
+    if VLLM_LOGGING_CONFIG_PATH:
+        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
+            raise RuntimeError(
+                "Could not load logging config. File does not exist: %s",
+                VLLM_LOGGING_CONFIG_PATH)
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
+            custom_config = json.loads(file.read())
+        if not isinstance(custom_config, dict):
+            raise ValueError("Invalid logging config. Expected dict, got %s.",
+                             type(custom_config).__name__)
+        logging_config = custom_config
+    for formatter in logging_config.get("formatters", {}).values():
+        # This provides backwards compatibility after #10134.
+        if formatter.get("class") == "vllm.logging.NewLineFormatter":
+            formatter["class"] = "vllm.logging_utils.NewLineFormatter"
+    if logging_config:
+        dictConfig(logging_config)
+def init_logger(name: str) -> _VllmLogger:
+    """The main purpose of this function is to ensure that loggers are
+    retrieved in such a way that we can be sure the root vllm logger has
+    already been configured."""
+    logger = logging.getLogger(name)
+    for method_name, method in _METHODS_TO_PATCH.items():
+        setattr(logger, method_name, MethodType(method, logger))
+    return cast(_VllmLogger, logger)
+# The root logger is initialized when the module is imported.
+# The module body runs only once per process: Python's import system
+# caches the module and serializes its first import across threads, so
+# no extra locking is needed here.
+_configure_vllm_root_logger()
+# Module-level logger for this file, created after configuration above.
+logger = init_logger(__name__)
+def _trace_calls(log_path, root_dir, frame, event, arg=None):
+    if event in ['call', 'return']:
+        # Extract the filename, line number, function name, and the code object
+        filename = frame.f_code.co_filename
+        lineno = frame.f_lineno
+        func_name = frame.f_code.co_name
+        if not filename.startswith(root_dir):
+            # only log the functions in the vllm root_dir
+            return
+        # Log every function call or return
+        try:
+            last_frame = frame.f_back
+            if last_frame is not None:
+                last_filename = last_frame.f_code.co_filename
+                last_lineno = last_frame.f_lineno
+                last_func_name = last_frame.f_code.co_name
+            else:
+                # initial frame
+                last_filename = ""
+                last_lineno = 0
+                last_func_name = ""
+            with open(log_path, 'a') as f:
+                ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+                if event == 'call':
+                    f.write(f"{ts} Call to"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" from {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+                else:
+                    f.write(f"{ts} Return from"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" to {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+        except NameError:
+            # modules are deleted during shutdown
+            pass
+    return partial(_trace_calls, log_path, root_dir)
+def enable_trace_function_call(log_file_path: str,
+                               root_dir: Optional[str] = None):
+    """
+    Enable tracing of every function call in code under `root_dir`.
+    This is useful for debugging hangs or crashes.
+    `log_file_path` is the path to the log file.
+    `root_dir` is the root directory of the code to trace. If None, it is the
+    vllm root directory.
+    Note that this call is thread-level, any threads calling this function
+    will have the trace enabled. Other threads will not be affected.
+    """
+    logger.warning(
+        "VLLM_TRACE_FUNCTION is enabled. It will record every"
+        " function executed by Python. This will slow down the code. It "
+        "is suggested to be used for debugging hang or crashes only.")
+    logger.info("Trace frame log is saved to %s", log_file_path)
+    if root_dir is None:
+        # by default, this is the vllm root directory
+        root_dir = os.path.dirname(os.path.dirname(__file__))
+    sys.settrace(partial(_trace_calls, log_file_path, root_dir))

vllm/logging_utils/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.logging_utils.formatter import NewLineFormatter
+# Public names of the `vllm.logging_utils` package (used by `import *`).
+__all__ = [
+    "NewLineFormatter",
+]

vllm/logging_utils/dump_input.py ADDED Viewed

@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import contextlib
+import enum
+import json
+from typing import Optional
+import torch
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.metrics.stats import SchedulerStats
+from vllm.version import __version__ as VLLM_VERSION
+logger = init_logger(__name__)
+def prepare_object_to_dump(obj) -> str:
+    if isinstance(obj, str):
+        return f"'{obj}'"  # Double quotes
+    elif isinstance(obj, dict):
+        dict_str = ', '.join({f'{str(k)}: {prepare_object_to_dump(v)}' \
+            for k, v in obj.items()})
+        return f'{{{dict_str}}}'
+    elif isinstance(obj, list):
+        return f"[{', '.join([prepare_object_to_dump(v) for v in obj])}]"
+    elif isinstance(obj, set):
+        return f"[{', '.join([prepare_object_to_dump(v) for v in list(obj)])}]"
+        # return [prepare_object_to_dump(v) for v in list(obj)]
+    elif isinstance(obj, tuple):
+        return f"[{', '.join([prepare_object_to_dump(v) for v in obj])}]"
+    elif isinstance(obj, enum.Enum):
+        return repr(obj)
+    elif isinstance(obj, torch.Tensor):
+        # We only print the 'draft' of the tensor to not expose sensitive data
+        # and to get some metadata in case of CUDA runtime crashed
+        return (f"Tensor(shape={obj.shape}, "
+                f"device={obj.device},"
+                f"dtype={obj.dtype})")
+    elif hasattr(obj, 'anon_repr'):
+        return obj.anon_repr()
+    elif hasattr(obj, '__dict__'):
+        items = obj.__dict__.items()
+        dict_str = ', '.join([f'{str(k)}={prepare_object_to_dump(v)}' \
+            for k, v in items])
+        return f"{type(obj).__name__}({dict_str})"
+    else:
+        # Hacky way to make sure we can serialize the object in JSON format
+        try:
+            return json.dumps(obj)
+        except (TypeError, OverflowError):
+            return repr(obj)
+def dump_engine_exception(config: VllmConfig,
+                          scheduler_output: SchedulerOutput,
+                          scheduler_stats: Optional[SchedulerStats]):
+    """
+    Best-effort wrapper around `_dump_engine_exception`: logs the engine
+    config, scheduler output and (if given) scheduler stats, and swallows
+    any `Exception` raised while doing so.
+    """
+    # NOTE: ensure we can log extra info without risking raises
+    # unexpected errors during logging
+    with contextlib.suppress(Exception):
+        _dump_engine_exception(config, scheduler_output, scheduler_stats)
+def _dump_engine_exception(config: VllmConfig,
+                           scheduler_output: SchedulerOutput,
+                           scheduler_stats: Optional[SchedulerStats]):
+    """
+    Log, at ERROR level, the vLLM version and `config`, then a string dump
+    of `scheduler_output`, then `scheduler_stats` when it is truthy.
+    A failure while preparing or logging the dump is itself logged with
+    its traceback instead of being raised.
+    """
+    logger.error(
+        "Dumping input data for V1 LLM engine (v%s) with config: %s, ",
+        VLLM_VERSION,
+        config,
+    )
+    try:
+        # Only the scheduler output goes through prepare_object_to_dump;
+        # the stats object is logged with its own string conversion.
+        dump_obj = prepare_object_to_dump(scheduler_output)
+        logger.error("Dumping scheduler output for model execution: %s",
+                     dump_obj)
+        if scheduler_stats:
+            logger.error("Dumping scheduler stats: %s", scheduler_stats)
+    except Exception:
+        logger.exception("Error preparing object to dump")

vllm/logging_utils/formatter.py ADDED Viewed

@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+from pathlib import Path
+from vllm import envs
+class NewLineFormatter(logging.Formatter):
+    """Adds logging prefix to newlines to align multi-line messages."""
+    def __init__(self, fmt, datefmt=None, style="%"):
+        """
+        Set up the formatter; relative-path file info is enabled only
+        when `envs.VLLM_LOGGING_LEVEL` equals "DEBUG".
+        """
+        super().__init__(fmt, datefmt, style)
+        self.use_relpath = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+        if self.use_relpath:
+            # Three levels above this file; record paths are made
+            # relative to this directory.
+            self.root_dir = Path(__file__).resolve().parent.parent.parent
+    def format(self, record):
+        """
+        Format `record`, setting `record.fileinfo` first (usable in format
+        strings as `%(fileinfo)s`) and repeating the log prefix after
+        every newline of the formatted output.
+        """
+        def shrink_path(relpath: Path) -> str:
+            """
+            Shortens a file path for logging display:
+            - Removes leading 'vllm' folder if present.
+            - If path starts with 'v1',
+            keeps the first two and last two levels,
+            collapsing the middle as '...'.
+            - Otherwise, keeps the first and last two levels,
+            collapsing the middle as '...'.
+            - If at most two levels remain after the kept prefix,
+            returns the path as-is (minus the leading 'vllm').
+            - Examples:
+            vllm/model_executor/layers/quantization/utils/fp8_utils.py ->
+            model_executor/.../utils/fp8_utils.py
+            vllm/model_executor/layers/quantization/awq.py ->
+            model_executor/.../quantization/awq.py
+            vllm/v1/attention/backends/mla/common.py ->
+            v1/attention/.../mla/common.py
+            vllm/v1/engine/core.py ->
+            v1/engine/core.py
+            Args:
+                relpath (Path): The relative path to be shortened.
+            Returns:
+                str: The shortened path string for display.
+            """
+            parts = list(relpath.parts)
+            new_parts = []
+            if parts and parts[0] == "vllm":
+                parts = parts[1:]
+            if parts and parts[0] == "v1":
+                new_parts += parts[:2]
+                parts = parts[2:]
+            elif parts:
+                new_parts += parts[:1]
+                parts = parts[1:]
+            # Collapse whenever more than two levels remain, even if only
+            # a single level ends up hidden behind '...'.
+            if len(parts) > 2:
+                new_parts += ["..."] + parts[-2:]
+            else:
+                new_parts += parts
+            return "/".join(new_parts)
+        if self.use_relpath:
+            abs_path = getattr(record, "pathname", None)
+            if abs_path:
+                try:
+                    relpath = Path(abs_path).resolve().relative_to(
+                        self.root_dir)
+                except Exception:
+                    # Not under root_dir (or unresolvable): fall back to
+                    # the bare file name.
+                    relpath = Path(record.filename)
+            else:
+                relpath = Path(record.filename)
+            record.fileinfo = shrink_path(relpath)
+        else:
+            record.fileinfo = record.filename
+        msg = super().format(record)
+        if record.message != "":
+            # Everything before the first occurrence of the message text
+            # is the prefix to repeat on continuation lines.
+            parts = msg.split(record.message)
+            msg = msg.replace("\n", "\r\n" + parts[0])
+        return msg