PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/v1/engine/detokenizer.py ADDED Viewed

@@ -0,0 +1,261 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from typing import Optional
+import tokenizers
+from packaging import version
+from tokenizers import Tokenizer
+from tokenizers.decoders import DecodeStream
+from transformers import PreTrainedTokenizerFast
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.logger import init_logger
+from vllm.transformers_utils.detokenizer_utils import (
+    AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
+from vllm.v1.engine import EngineCoreRequest
+logger = init_logger(__name__)
+class IncrementalDetokenizer:
+    def __init__(self):
+        self.token_ids: list[int] = []
+    @property
+    def output_token_ids(self) -> list[int]:
+        return self.token_ids
+    def update(self, new_token_ids: list[int],
+               stop_terminated: bool) -> Optional[str]:
+        self.token_ids.extend(new_token_ids)
+        return None
+    def get_next_output_text(self, finished: bool, delta: bool) -> str:
+        return ""
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: Optional[AnyTokenizer],
+        request: EngineCoreRequest,
+    ) -> "IncrementalDetokenizer":
+        if tokenizer is None:
+            # No tokenizer => skipping detokenization.
+            return IncrementalDetokenizer()
+        if (isinstance(tokenizer, PreTrainedTokenizerFast) and version.parse(
+                tokenizers.__version__) >= version.parse("0.21.1")):
+            # Fast tokenizer => use tokenizers library DecodeStream.
+            # And only tokenizers >= 0.21.1 supports Fast Detokenizer.
+            return FastIncrementalDetokenizer(tokenizer, request)
+        # Fall back to slow python-based incremental detokenization.
+        return SlowIncrementalDetokenizer(tokenizer, request)
+class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
+    def __init__(self, request: EngineCoreRequest):
+        super().__init__()
+        # Stop strings
+        params = request.sampling_params
+        self.stop = stop = params.stop
+        self.include_stop_str_in_output = params.include_stop_str_in_output
+        # Number of chars to hold back when stop strings are to be excluded
+        # from streamed output.
+        if stop and not self.include_stop_str_in_output:
+            self.stop_buffer_length = max(len(s) for s in stop) - 1
+        else:
+            self.stop_buffer_length = 0
+        self._last_output_text_offset: int = 0
+        # Generation data
+        self.output_text = ""
+    def update(self, new_token_ids: list[int],
+               stop_terminated: bool) -> Optional[str]:
+        """
+        Update RequestState for the request_id by:
+            1) Detokenize the new token ids incrementally.
+            2) Evaluate stop criteria.
+        Return matched stop string or None.
+        """
+        if not new_token_ids:
+            # Skip detokenization if no new token ids.
+            return None
+        if stop_terminated and not self.include_stop_str_in_output:
+            # If stop-terminated, exclude last token from detokenization
+            # based on include_stop_str_in_output parameter.
+            skipped_stop_token_id = new_token_ids[-1]
+            new_token_ids = new_token_ids[:-1]
+        else:
+            skipped_stop_token_id = None
+        # 1) Detokenize the new token ids incrementally.
+        # TODO(woosuk): This method becomes very inefficient when the number of
+        # new_token_ids is more than 1. We need to optimize this.
+        offset_before = len(self.output_text)
+        for new_token_id in new_token_ids:
+            self.token_ids.append(new_token_id)
+            self.output_text += self.decode_next(new_token_id)
+        if stop_terminated:
+            if skipped_stop_token_id is not None:
+                # Cleanup after skipping detokenization.
+                self.token_ids.append(skipped_stop_token_id)
+            # Stop token triggered; skip stop string check.
+            return None
+        # 2) Evaluate stop strings.
+        stop_string = None
+        if self.stop:
+            stop = StopChecker.check_stop_strings(
+                output_text=self.output_text,
+                new_char_count=len(self.output_text) - offset_before,
+                stop=self.stop,
+                include_in_output=self.include_stop_str_in_output,
+            )
+            if stop is not None:
+                stop_string, truncate_to = stop
+                if truncate_to != -1:
+                    self.output_text = self.output_text[:truncate_to]
+        return stop_string
+    @abstractmethod
+    def decode_next(self, next_token_id: int) -> str:
+        raise NotImplementedError
+    def get_next_output_text(self, finished: bool, delta: bool) -> str:
+        """If delta is True, only new text since the last call to
+        this method is returned"""
+        # We return the full output text if the sequence is finished.
+        buffer_length = 0 if finished else self.stop_buffer_length
+        if not delta:
+            return self.output_text[:-buffer_length] if buffer_length else (
+                self.output_text)
+        length = len(self.output_text) - buffer_length
+        last_offset = self._last_output_text_offset
+        if last_offset < length:
+            self._last_output_text_offset = length
+            return self.output_text[last_offset:length]
+        return ""
+class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
+    """Incremental detokenizer built on the tokenizers library `DecodeStream`."""
+    def __init__(self, tokenizer: PreTrainedTokenizerFast,
+                 request: EngineCoreRequest):
+        """Create the decode stream and prime it with a suffix of the prompt."""
+        super().__init__(request)
+        sampling_params = request.sampling_params
+        self.stream = DecodeStream(
+            skip_special_tokens=sampling_params.skip_special_tokens)
+        # Work with the underlying tokenizers-library object directly.
+        self.tokenizer: Tokenizer = tokenizer._tokenizer
+        # Find a safe place to start.
+        # By default the whole prompt primes the stream. For prompts longer
+        # than 4 tokens, try suffixes of length 4 up to 23 and keep the first
+        # one whose decoded text contains no replacement character.
+        prompt_suffix = request.prompt_token_ids
+        prompt_len = len(prompt_suffix)
+        if prompt_len > 4:
+            for i in range(4, min(prompt_len + 1, 24)):
+                suffix = request.prompt_token_ids[-i:]
+                if '�' not in self.tokenizer.decode(suffix):
+                    prompt_suffix = suffix
+                    break
+        # Prime the stream.
+        # The text returned by these steps is discarded.
+        for tid in prompt_suffix:
+            self.stream.step(self.tokenizer, tid)
+        # Treated as enabled whenever special tokens are skipped anyway.
+        self.spaces_between_special_tokens = (
+            sampling_params.skip_special_tokens
+            or sampling_params.spaces_between_special_tokens)
+        if not self.spaces_between_special_tokens:
+            # Store dict of added token ids so that we can suppress
+            # the spaces between them.
+            # The dict is cached as an attribute on the tokenizer object
+            # itself, so it is built once per tokenizer, not per request.
+            if (added_token_ids := getattr(self.tokenizer, "added_token_ids",
+                                           None)) is None:
+                self.tokenizer.added_token_ids = added_token_ids = {
+                    tid: tok.content
+                    for tid, tok in
+                    self.tokenizer.get_added_tokens_decoder().items()
+                }
+            if added_token_ids:
+                # Tracks whether the previously decoded token was an
+                # added token.
+                self.last_special = False
+                self.added_token_ids = added_token_ids
+            else:
+                # No added tokens.
+                # Nothing to suppress, so decode_next skips the extra check.
+                self.spaces_between_special_tokens = True
+    def decode_next(self, next_token_id: int) -> str:
+        """Advance the stream by one token and return the text it produced.
+        When spaces between special tokens are suppressed and both this
+        token and the previous one are added tokens, the raw content of the
+        added token is returned instead of the stream output.
+        """
+        token = self.stream.step(self.tokenizer, next_token_id)
+        if not self.spaces_between_special_tokens:
+            special_token = self.added_token_ids.get(next_token_id)
+            is_special = special_token is not None
+            if is_special and self.last_special:
+                # Return raw token string without any prefixed spaces.
+                token = special_token
+            self.last_special = is_special
+        # A falsy step result (presumably None when the stream has no
+        # complete text to emit yet — TODO confirm) becomes "".
+        return token or ""
+class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
+    def __init__(self, tokenizer: AnyTokenizer, request: EngineCoreRequest):
+        super().__init__(request)
+        self.tokenizer = tokenizer
+        # Metadata for incremental detokenization.
+        self.tokens, self.prefix_offset, self.read_offset = (
+            convert_prompt_ids_to_tokens(
+                tokenizer=tokenizer,
+                prompt_ids=request.prompt_token_ids,
+                skip_special_tokens=request.sampling_params.
+                skip_special_tokens,
+            ))
+        self.token_ids.extend(request.prompt_token_ids)
+        self.prompt_len = len(request.prompt_token_ids)
+        params = request.sampling_params
+        self.skip_special_tokens = params.skip_special_tokens
+        self.spaces_between_special_tokens = (
+            params.spaces_between_special_tokens)
+    @property
+    def output_token_ids(self) -> list[int]:
+        return self.token_ids if not self.prompt_len else (
+            self.token_ids[self.prompt_len:])
+    def decode_next(self, next_token_id: int) -> str:
+        new_tokens, decoded_text, prefix_offset, read_offset = (
+            detokenize_incrementally(
+                tokenizer=self.tokenizer,
+                all_input_ids=self.token_ids,
+                prev_tokens=self.tokens,
+                prefix_offset=self.prefix_offset,
+                read_offset=self.read_offset,
+                skip_special_tokens=self.skip_special_tokens,
+                spaces_between_special_tokens=self.
+                spaces_between_special_tokens,
+            ))
+        self.tokens.extend(new_tokens)
+        self.prefix_offset = prefix_offset
+        self.read_offset = read_offset
+        return decoded_text

vllm/v1/engine/exceptions.py ADDED Viewed

@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+class EngineGenerateError(Exception):
+    """Raised when a AsyncLLM.generate() fails. Recoverable."""
+    pass
+class EngineDeadError(Exception):
+    """Raised when the EngineCore dies. Unrecoverable."""
+    def __init__(self, *args, suppress_context: bool = False, **kwargs):
+        ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause."  # noqa: E501
+        super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs)
+        # Make stack trace clearer when using with LLMEngine by
+        # silencing irrelevant ZMQError.
+        self.__suppress_context__ = suppress_context

vllm/v1/engine/llm_engine.py ADDED Viewed

@@ -0,0 +1,317 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Mapping
+from copy import copy
+from typing import Any, Callable, Optional, Union
+from typing_extensions import TypeVar
+import vllm.envs as envs
+from vllm.config import ParallelConfig, VllmConfig
+from vllm.distributed import stateless_destroy_torch_distributed_process_group
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptType
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.outputs import RequestOutput
+from vllm.pooling_params import PoolingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer_group import (
+    TokenizerGroup, init_tokenizer_from_configs)
+from vllm.usage.usage_lib import UsageContext
+from vllm.utils import Device
+from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.engine.output_processor import OutputProcessor
+from vllm.v1.engine.parallel_sampling import ParentRequest
+from vllm.v1.engine.processor import Processor
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase,
+                                     StatLoggerFactory)
+from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
+from vllm.v1.metrics.stats import IterationStats
+logger = init_logger(__name__)
+_R = TypeVar("_R", default=Any)
+class LLMEngine:
+    """Legacy LLMEngine for backwards compatibility."""
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: type[Executor],
+        log_stats: bool,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+        use_cached_outputs: bool = False,
+        multiprocess_mode: bool = False,
+    ) -> None:
+        if not envs.VLLM_USE_V1:
+            raise ValueError(
+                "Using V1 LLMEngine, but envs.VLLM_USE_V1=False. "
+                "This should not happen. As a workaround, try using "
+                "LLMEngine.from_vllm_config(...) or explicitly set "
+                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+        if stat_loggers is not None:
+            raise NotImplementedError(
+                "Passing StatLoggers to LLMEngine in V1 is not yet supported. "
+                "Set VLLM_USE_V1=0 and file and issue on Github.")
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+        self.log_stats = log_stats
+        self.stat_logger: Optional[StatLoggerBase] = None
+        if self.log_stats:
+            self.stat_logger = PrometheusStatLogger(vllm_config)
+        # important: init dp group before init the engine_core
+        # In the decoupled engine case this is handled in EngineCoreProc.
+        parallel_config = vllm_config.parallel_config
+        if not multiprocess_mode and parallel_config.data_parallel_size > 1:
+            self.dp_group = parallel_config.stateless_init_dp_group()
+        else:
+            self.dp_group = None
+        self.should_execute_dummy_batch = False
+        # Tokenizer (+ ensure liveness if running in another process).
+        self.tokenizer = init_tokenizer_from_configs(
+            model_config=vllm_config.model_config,
+            scheduler_config=vllm_config.scheduler_config,
+            lora_config=vllm_config.lora_config)
+        # Processor (convert Inputs --> EngineCoreRequests)
+        self.processor = Processor(vllm_config=vllm_config,
+                                   tokenizer=self.tokenizer,
+                                   mm_registry=mm_registry)
+        # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
+        self.output_processor = OutputProcessor(self.tokenizer,
+                                                log_stats=self.log_stats)
+        # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
+        self.engine_core = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocess_mode,
+            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=self.log_stats,
+        )
+        if not multiprocess_mode:
+            # for v0 compatibility
+            self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
+        # Don't keep the dummy data in memory
+        self.reset_mm_cache()
+    @classmethod
+    def from_vllm_config(
+        cls,
+        vllm_config: VllmConfig,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        disable_log_stats: bool = False,
+    ) -> "LLMEngine":
+        return cls(vllm_config=vllm_config,
+                   executor_class=Executor.get_class(vllm_config),
+                   log_stats=(not disable_log_stats),
+                   usage_context=usage_context,
+                   stat_loggers=stat_loggers,
+                   multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING)
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        enable_multiprocessing: bool = False,
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+        vllm_config = engine_args.create_engine_config(usage_context)
+        executor_class = Executor.get_class(vllm_config)
+        if envs.VLLM_ENABLE_V1_MULTIPROCESSING:
+            logger.debug("Enabling multiprocessing for LLMEngine.")
+            enable_multiprocessing = True
+        # Create the LLMEngine.
+        return cls(vllm_config=vllm_config,
+                   executor_class=executor_class,
+                   log_stats=not engine_args.disable_log_stats,
+                   usage_context=usage_context,
+                   stat_loggers=stat_loggers,
+                   multiprocess_mode=enable_multiprocessing)
+    def get_num_unfinished_requests(self) -> int:
+        return self.output_processor.get_num_unfinished_requests()
+    def has_unfinished_requests(self) -> bool:
+        has_unfinished = self.output_processor.has_unfinished_requests()
+        if self.dp_group is None:
+            return has_unfinished
+        return self.has_unfinished_requests_dp(has_unfinished)
+    def has_unfinished_requests_dp(self, has_unfinished: bool) -> bool:
+        aggregated_has_unfinished = ParallelConfig.has_unfinished_dp(
+            self.dp_group, has_unfinished)
+        if not has_unfinished and aggregated_has_unfinished:
+            self.should_execute_dummy_batch = True
+        return aggregated_has_unfinished
+    @classmethod
+    def validate_outputs(cls, outputs, output_type):
+        return outputs
+    def abort_request(self, request_ids: list[str]) -> None:
+        """Remove request_ids from EngineCore and Detokenizer."""
+        request_ids = self.output_processor.abort_requests(request_ids)
+        self.engine_core.abort_requests(request_ids)
+    def add_request(
+        self,
+        request_id: str,
+        prompt: PromptType,
+        params: Union[SamplingParams, PoolingParams],
+        arrival_time: Optional[float] = None,
+        lora_request: Optional[LoRARequest] = None,
+        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        priority: int = 0,
+    ) -> None:
+        # Process raw inputs into the request.
+        prompt_str, request = self.processor.process_inputs(
+            request_id, prompt, params, arrival_time, lora_request,
+            tokenization_kwargs, trace_headers, prompt_adapter_request,
+            priority)
+        n = params.n if isinstance(params, SamplingParams) else 1
+        if n == 1:
+            # Make a new RequestState and queue.
+            self.output_processor.add_request(request, prompt_str, None, 0)
+            # Add the request to EngineCore.
+            self.engine_core.add_request(request)
+            return
+        # Fan out child requests (for n>1).
+        parent_req = ParentRequest(request_id, params)
+        for idx in range(n):
+            request_id, params = parent_req.get_child_info(idx)
+            child_request = request if idx == n - 1 else copy(request)
+            child_request.request_id = request_id
+            child_request.sampling_params = params
+            # Make a new RequestState and queue.
+            self.output_processor.add_request(child_request, prompt_str,
+                                              parent_req, idx)
+            # Add the request to EngineCore.
+            self.engine_core.add_request(child_request)
+    def step(self) -> list[RequestOutput]:
+        if self.should_execute_dummy_batch:
+            self.should_execute_dummy_batch = False
+            self.engine_core.execute_dummy_batch()
+            return []
+        # 1) Get EngineCoreOutput from the EngineCore.
+        outputs = self.engine_core.get_output()
+        # 2) Process EngineCoreOutputs.
+        iteration_stats = IterationStats() if self.log_stats else None
+        processed_outputs = self.output_processor.process_outputs(
+            outputs.outputs,
+            engine_core_timestamp=outputs.timestamp,
+            iteration_stats=iteration_stats)
+        # 3) Abort any reqs that finished due to stop strings.
+        self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
+        # 4) Record stats
+        if self.stat_logger is not None:
+            assert outputs.scheduler_stats is not None
+            self.stat_logger.record(scheduler_stats=outputs.scheduler_stats,
+                                    iteration_stats=iteration_stats)
+        return processed_outputs.request_outputs
+    def get_vllm_config(self):
+        return self.vllm_config
+    def get_model_config(self):
+        return self.model_config
+    def start_profile(self):
+        self.engine_core.profile(True)
+    def stop_profile(self):
+        self.engine_core.profile(False)
+    def reset_mm_cache(self):
+        self.processor.mm_registry.reset_processor_cache()
+        self.processor.mm_input_cache_client.reset()
+        self.engine_core.reset_mm_cache()
+    def reset_prefix_cache(self, device: Optional[Device] = None):
+        self.engine_core.reset_prefix_cache()
+    def sleep(self, level: int = 1):
+        self.engine_core.sleep(level)
+    def wake_up(self, tags: Optional[list[str]] = None):
+        self.engine_core.wake_up(tags)
+    def is_sleeping(self) -> bool:
+        return self.engine_core.is_sleeping()
+    def get_metrics(self) -> list[Metric]:
+        assert self.log_stats, "Stat logging disabled"
+        return get_metrics_snapshot()
+    def get_tokenizer_group(self) -> TokenizerGroup:
+        if self.tokenizer is None:
+            raise ValueError("Unable to get tokenizer because "
+                             "skip_tokenizer_init is True")
+        return self.tokenizer
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        """Load a new LoRA adapter into the engine for future requests."""
+        return self.engine_core.add_lora(lora_request)
+    def remove_lora(self, lora_id: int) -> bool:
+        """Remove an already loaded LoRA adapter."""
+        return self.engine_core.remove_lora(lora_id)
+    def list_loras(self) -> set[int]:
+        """List all registered adapters."""
+        return self.engine_core.list_loras()
+    def pin_lora(self, lora_id: int) -> bool:
+        """Prevent an adapter from being evicted."""
+        return self.engine_core.pin_lora(lora_id)
+    def collective_rpc(self,
+                       method: Union[str, Callable[..., _R]],
+                       timeout: Optional[float] = None,
+                       args: tuple = (),
+                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+        return self.engine_core.collective_rpc(method, timeout, args, kwargs)
+    def __del__(self):
+        if dp_group := getattr(self, "dp_group", None):
+            stateless_destroy_torch_distributed_process_group(dp_group)