PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/v1/engine/logprobs.py ADDED Viewed

@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: Apache-2.0
+import itertools
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+from vllm.logger import init_logger
+from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
+from vllm.transformers_utils.detokenizer_utils import (
+    AnyTokenizer, convert_ids_list_to_tokens)
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.outputs import LogprobsLists, LogprobsTensors
+logger = init_logger(__name__)
+NONES = itertools.repeat(None)
+@dataclass
+class LogprobsProcessor:
+    """Per-request accumulator for sample logprobs and prompt logprobs.
+    Built via from_new_request() and fed EngineCoreOutput objects through
+    update_from_output(). Each of the two kinds of logprobs is tracked
+    only if the corresponding count in the sampling params is not None.
+    """
+    # Tokenizer for this request,
+    # None if detokenization is disabled.
+    tokenizer: Optional[AnyTokenizer]
+    # Logprobs for this request
+    # (None when sample logprobs were not requested; otherwise one
+    # dict[token id, Logprob] is appended per generated position).
+    logprobs: Optional[SampleLogprobs]
+    # None when prompt logprobs were not requested; otherwise starts as
+    # [None] and grows by one dict per prompt position.
+    prompt_logprobs: Optional[PromptLogprobs]
+    # Running sum of the sampled-token logprobs; None when sample
+    # logprobs were not requested.
+    cumulative_logprob: Optional[float]
+    # Copied from sampling_params.logprobs / sampling_params.prompt_logprobs.
+    num_logprobs: Optional[int]
+    num_prompt_logprobs: Optional[int]
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: Optional[AnyTokenizer],
+        request: EngineCoreRequest,
+    ) -> "LogprobsProcessor":
+        """Create an empty processor configured from the request.
+        Args:
+          tokenizer: tokenizer used to decode logprob token ids, or None
+            to leave decoded tokens as None.
+          request: request whose sampling_params supply the logprobs and
+            prompt_logprobs counts.
+        Returns:
+          A processor whose accumulators are initialised only for the
+          kinds of logprobs that were requested.
+        """
+        num_logprobs = request.sampling_params.logprobs
+        num_prompt_logprobs = request.sampling_params.prompt_logprobs
+        return cls(
+            tokenizer=tokenizer,
+            cumulative_logprob=(None if num_logprobs is None else 0.),
+            logprobs=(None if num_logprobs is None else []),
+            # NOTE: logprob of first prompt token is None.
+            prompt_logprobs=(None if num_prompt_logprobs is None else [None]),
+            num_prompt_logprobs=num_prompt_logprobs,
+            num_logprobs=num_logprobs,
+        )
+    def _update_sample_logprobs(self, logprobs_lists: LogprobsLists) -> None:
+        """Update with sample logprobs from EngineCore.
+        Outer lists are only of len > 1 if EngineCore made
+        >1 tokens in prior step (e.g. in spec decoding).
+        Args:
+          logprobs_lists: the lists of logprob tokens, logprobs, and ranks.
+        """
+        # Sample logprobs must have been enabled in from_new_request().
+        assert self.num_logprobs is not None
+        assert self.logprobs is not None
+        assert self.cumulative_logprob is not None
+        token_ids_lst, logprobs_lst, ranks_lst = logprobs_lists
+        # One iteration per newly generated position.
+        for rank, logprobs, token_ids in zip(ranks_lst, logprobs_lst,
+                                             token_ids_lst):
+            # Detokenize (non-incrementally).
+            # Without a tokenizer, NONES (an endless iterator of None)
+            # stands in for the decoded tokens.
+            decoded_tokens = NONES if self.tokenizer is None else (
+                convert_ids_list_to_tokens(self.tokenizer, token_ids))
+            # Sampler puts the sampled logprob in first.
+            sampled_token_logprob = logprobs[0]
+            self.cumulative_logprob += sampled_token_logprob
+            # Update with the Logprob dictionary for this pos.
+            self.logprobs.append(
+                self._make_logprob_dict(
+                    logprobs,
+                    token_ids,
+                    decoded_tokens,
+                    rank,
+                    self.num_logprobs,
+                ))
+    def _update_prompt_logprobs(
+        self,
+        prompt_logprobs_tensors: LogprobsTensors,
+    ) -> None:
+        """Update with prompt logprobs from EngineCore.
+        Args:
+          prompt_logprobs_tensors: tuple containing the prompt logprobs
+                                   tensors.
+        """
+        # Prompt logprobs are enabled.
+        assert self.num_prompt_logprobs is not None
+        assert self.prompt_logprobs is not None
+        token_ids, logprobs, ranks = prompt_logprobs_tensors
+        # Detokenize non-incrementally.
+        # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
+        decoded_tokens = None if self.tokenizer is None else (
+            convert_ids_list_to_tokens(self.tokenizer,
+                                       token_ids.flatten().tolist()))
+        # Recover shapes.
+        num_prompt_tokens, num_logprobs = logprobs.shape
+        # Pythonize the torch tensors.
+        prompt_token_ranks = ranks.tolist()
+        prompt_logprobs = logprobs.tolist()
+        token_ids = token_ids.tolist()
+        # Make Logprob for each position.
+        for pos in range(num_prompt_tokens):
+            # Handle flattening.
+            # Row `pos` of the 2-D tensors occupies this slice of the
+            # flat decoded-token list.
+            offset = pos * num_logprobs
+            offset_end = offset + num_logprobs
+            decoded_tokens_for_pos = NONES \
+            if decoded_tokens is None else decoded_tokens[offset:offset_end]
+            # Update with the Logprob dictionary for this pos.
+            self.prompt_logprobs.append(
+                self._make_logprob_dict(prompt_logprobs[pos], token_ids[pos],
+                                        decoded_tokens_for_pos,
+                                        prompt_token_ranks[pos],
+                                        self.num_prompt_logprobs))
+    def pop_prompt_logprobs(self) -> Optional[PromptLogprobs]:
+        """Pop and return all request prompt logprobs
+        The logprobs processor aggregates prompt chunk logprobs
+        over one or more prefill chunks. This method returns
+        all prompt logprobs at once and then forgets them.
+        Ensures correct RequestOutputKind.DELTA semantics
+        wherein all prompt logprobs are returned at once at
+        the end of prefill.
+        Returns:
+          None if prompt logprobs are disabled for this request.
+          List of all prompt logprobs, otherwise.
+        """
+        plp = self.prompt_logprobs
+        # Only a non-empty list is reset; None (disabled) stays None and
+        # an already-empty list is returned as-is.
+        if plp:
+            self.prompt_logprobs = []
+        return plp
+    @staticmethod
+    def _make_logprob_dict(
+        logprobs: list[float],
+        logprob_token_ids: list[int],
+        decoded_tokens: Iterable[Optional[str]],
+        rank: int,
+        num_logprobs: int,
+    ) -> dict[int, Logprob]:
+        """Make a Logprob dictionary for a position.
+        Args:
+          logprobs: list of log probabilities
+          logprob_token_ids: list of top token ids
+          decoded_tokens: list of decoded top tokens
+          rank: rank of the sampled token
+          num_logprobs: number of logprobs requested
+            by the user (in addition to sampled logprob)
+        Returns:
+          dict[token id, Logprob]
+        """
+        # We do not need a special case for the sampled token
+        # being in the topk, since inserting duplicated data
+        # into a dictionary twice is the same as doing it once.
+        topk_ranks = range(1, num_logprobs + 1)
+        # First entry gets the sampled token's rank, the remaining
+        # entries get ranks 1..num_logprobs in order.
+        ranks = itertools.chain((rank, ), topk_ranks)
+        # zip() stops at the shortest input, so an endless decoded_tokens
+        # iterator (NONES) is safe here.
+        return {
+            token_id: Logprob(
+                logprob=logprob,
+                rank=rank,
+                decoded_token=token,
+            )
+            for token_id, logprob, rank, token in zip(
+                logprob_token_ids, logprobs, ranks, decoded_tokens)
+        }
+    def update_from_output(self, output: EngineCoreOutput) -> None:
+        """Fold the logprobs carried by one EngineCoreOutput into this state.
+        Sample logprobs and prompt logprobs are handled independently;
+        either may be absent (None) on a given output.
+        """
+        if output.new_logprobs is not None:
+            self._update_sample_logprobs(output.new_logprobs)
+        if output.new_prompt_logprobs_tensors is not None:
+            self._update_prompt_logprobs(output.new_prompt_logprobs_tensors)

vllm/v1/engine/mm_input_cache.py ADDED Viewed

@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections.abc import Sequence
+from typing import Optional
+from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
+from vllm.multimodal import MultiModalKwargs
+from vllm.multimodal.processing import ProcessingCache
+from vllm.utils import is_list_of
+# The idea of multimodal preprocessing caching is based on having a client and
+# a server, where the client executes in the frontend process (=P0) and the
+# server in the core process (=P1).
+#
+# -- Client:
+#  - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs
+#    with built-in caching functionality, with mm_hash as its identifier.
+#  - MirroredProcessingCache to keep track of the cached entries and
+#    determine whether to send the MultiModalKwargs to P1.
+#
+# -- Server:
+#  - MirroredProcessingCache to store the MultiModalKwargs from P0.
+#
+# The caching for both client and server is mirrored, and this allows us
+# to avoid the serialization of "mm_inputs" (like pixel values) between
+# client (=P0) and server (=P1) processes if the mm_hash is found in the client
+# cache.
+# Both Client and Server must use the same cache size
+# (to perform mirrored caching). This cache size is set by the environment
+# variable VLLM_MM_INPUT_CACHE_GIB.
+class MirroredProcessingCache:
+    """Hash-keyed cache of MultiModalKwargs, used identically on P0 and P1.
+    P0 calls get_and_update_p0() to replace already-cached inputs with
+    None before sending; P1 calls get_and_update_p1() to restore those
+    None entries from its own copy of the cache.
+    """
+    def __init__(self, model_config):
+        # Caching is bypassed when the model config disables the
+        # multimodal preprocessor cache.
+        self.use_cache = not model_config.disable_mm_preprocessor_cache
+        # The cache object is created regardless of use_cache; its size
+        # comes from VLLM_MM_INPUT_CACHE_GIB.
+        self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
+                                                      MultiModalKwargs)
+    def get_and_update_p0(
+        self,
+        mm_inputs: Sequence[MultiModalKwargs],
+        mm_hashes: list[str],
+    ) -> Sequence[Optional[MultiModalKwargs]]:
+        """Client-side pass: drop inputs whose hash is already cached.
+        Args:
+          mm_inputs: processed multimodal inputs, one per hash.
+          mm_hashes: identifiers for mm_inputs, same length and order.
+        Returns:
+          mm_inputs unchanged when caching is disabled. Otherwise a new
+          list of the same length where an entry is None if its hash was
+          already in the cache, and the original input (now stored in
+          the cache) if it was not.
+        """
+        assert len(mm_inputs) == len(mm_hashes)
+        if not self.use_cache:
+            assert is_list_of(mm_inputs, MultiModalKwargs)
+            return mm_inputs
+        full_mm_inputs = list[Optional[MultiModalKwargs]]()
+        for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
+            if self.mm_cache.get(mm_hash) is not None:
+                # Cache hit: send None in place of this input.
+                mm_input = None
+            else:
+                self.mm_cache[mm_hash] = mm_input
+            full_mm_inputs.append(mm_input)
+        return full_mm_inputs
+    def get_and_update_p1(
+        self,
+        mm_inputs: Sequence[Optional[MultiModalKwargs]],
+        mm_hashes: list[str],
+    ) -> Sequence[MultiModalKwargs]:
+        """Server-side pass: restore None entries from the cache.
+        Args:
+          mm_inputs: inputs as received, with None standing for an entry
+            that should be taken from the cache.
+          mm_hashes: identifiers for mm_inputs, same length and order.
+        Returns:
+          mm_inputs unchanged when caching is disabled. Otherwise a new
+          list of the same length with every None replaced by the cached
+          value for its hash; non-None inputs are stored in the cache.
+        """
+        assert len(mm_inputs) == len(mm_hashes)
+        if not self.use_cache:
+            assert is_list_of(mm_inputs, MultiModalKwargs)
+            return mm_inputs
+        full_mm_inputs = list[MultiModalKwargs]()
+        for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
+            if mm_input is None:
+                # NOTE(review): relies on the hash being present here;
+                # behaviour on a miss depends on the cache type - confirm.
+                mm_input = self.mm_cache[mm_hash]
+            else:
+                self.mm_cache[mm_hash] = mm_input
+            full_mm_inputs.append(mm_input)
+        return full_mm_inputs

vllm/v1/engine/output_processor.py ADDED Viewed

@@ -0,0 +1,420 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional, Union
+from vllm.outputs import CompletionOutput, RequestOutput
+from vllm.sampling_params import RequestOutputKind
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
+from vllm.v1.engine.detokenizer import IncrementalDetokenizer
+from vllm.v1.engine.logprobs import LogprobsProcessor
+from vllm.v1.engine.parallel_sampling import ParentRequest
+from vllm.v1.metrics.stats import (IterationStats, LoRARequestStates,
+                                   RequestStateStats)
+class RequestOutputCollector:
+    """
+    Collects streamed RequestOutputs per individual request,
+    for hand-off to the consuming asyncio generate task.
+    When streaming deltas, RequestOutputs are merged if the
+    producer gets ahead of the consumer.
+    """
+    def __init__(self, output_kind: RequestOutputKind):
+        # Passed as `aggregate` to RequestOutput.add() when merging.
+        self.aggregate = output_kind == RequestOutputKind.DELTA
+        # Single pending slot: None means nothing is waiting.
+        self.output: Optional[Union[RequestOutput, Exception]] = None
+        self.ready = asyncio.Event()
+    def put(self, output: Union[RequestOutput, Exception]) -> None:
+        """Non-blocking put operation."""
+        # An Exception always replaces whatever is pending.
+        if self.output is None or isinstance(output, Exception):
+            self.output = output
+            self.ready.set()
+        elif isinstance(self.output, RequestOutput):
+            # This ensures that request outputs with different request indexes
+            # (if n > 1) do not override each other.
+            self.output.add(output, aggregate=self.aggregate)
+        # NOTE(review): a RequestOutput arriving while an Exception is
+        # pending matches neither branch and is dropped.
+    async def get(self) -> RequestOutput:
+        """Get operation blocks on put event."""
+        while (output := self.output) is None:
+            await self.ready.wait()
+        # Empty the slot and reset the event before handing the value over.
+        # NOTE(review): put() merges without setting the event, so a put
+        # between the wait() returning and this point looks like it can be
+        # cleared before a later get() sees it - confirm.
+        self.output = None
+        self.ready.clear()
+        if isinstance(output, Exception):
+            raise output
+        return output
+    def get_nowait(self) -> Optional[RequestOutput]:
+        """Non-blocking get operation."""
+        # Returns None when nothing is pending; raises a pending Exception.
+        output = self.output
+        if output is not None:
+            self.output = None
+            self.ready.clear()
+        if isinstance(output, Exception):
+            raise output
+        return output
+@dataclass
+class OutputProcessorOutput:
+    # Result bundle returned by OutputProcessor.process_outputs().
+    # RequestOutputs created for requests that have no queue attached.
+    request_outputs: list[RequestOutput]
+    # Ids of requests that finished during output processing while the
+    # EngineCoreOutput itself was not marked finished.
+    reqs_to_abort: list[str]
+class RequestState:
+    def __init__(
+        self,
+        request_id: str,
+        parent_req: Optional[ParentRequest],
+        request_index: int,
+        lora_name: Optional[str],
+        output_kind: RequestOutputKind,
+        prompt: Optional[str],
+        prompt_token_ids: list[int],
+        logprobs_processor: LogprobsProcessor,
+        detokenizer: IncrementalDetokenizer,
+        max_tokens_param: Optional[int],
+        arrival_time: float,
+        queue: Optional[RequestOutputCollector],
+        log_stats: bool,
+    ):
+        self.request_id = request_id
+        self.parent_req = parent_req
+        self.request_index = request_index
+        self.lora_name = lora_name
+        self.output_kind = output_kind
+        self.prompt = prompt
+        self.prompt_token_ids = prompt_token_ids
+        self.prompt_len = len(prompt_token_ids)
+        self.logprobs_processor = logprobs_processor
+        self.detokenizer = detokenizer
+        self.max_tokens_param = max_tokens_param
+        self.is_prefilling = True
+        self.queue = queue
+        self.stats = RequestStateStats(
+            arrival_time=arrival_time) if log_stats else None
+    @classmethod
+    def from_new_request(
+        cls,
+        tokenizer: AnyTokenizer,
+        request: EngineCoreRequest,
+        prompt: Optional[str],
+        parent_req: Optional[ParentRequest],
+        request_index: int,
+        queue: Optional[RequestOutputCollector],
+        log_stats: bool,
+    ) -> "RequestState":
+        if not request.sampling_params.detokenize:
+            tokenizer = None
+        return cls(
+            request_id=request.request_id,
+            parent_req=parent_req,
+            request_index=request_index,
+            lora_name=(request.lora_request.name
+                       if request.lora_request is not None else None),
+            output_kind=request.sampling_params.output_kind,
+            prompt=prompt,
+            prompt_token_ids=request.prompt_token_ids,
+            logprobs_processor=LogprobsProcessor.from_new_request(
+                tokenizer=tokenizer,
+                request=request,
+            ),
+            detokenizer=IncrementalDetokenizer.from_new_request(
+                tokenizer=tokenizer,
+                request=request,
+            ),
+            max_tokens_param=(request.sampling_params.max_tokens if
+                              request.sampling_params is not None else None),
+            arrival_time=request.arrival_time,
+            queue=queue,
+            log_stats=log_stats,
+        )
+    def make_request_output(
+        self,
+        new_token_ids: list[int],
+        finish_reason: Optional[FinishReason],
+        stop_reason: Union[int, str, None],
+    ) -> Optional[RequestOutput]:
+        finished = finish_reason is not None
+        final_only = self.output_kind == RequestOutputKind.FINAL_ONLY
+        if not finished and final_only:
+            # Only the final output is required in FINAL_ONLY mode.
+            return None
+        completion_output = self._new_completion_output(
+            new_token_ids, finish_reason, stop_reason)
+        request_id = self.request_id
+        if self.parent_req is None:
+            outputs = [completion_output]
+        else:
+            request_id, outputs, finished = self.parent_req.get_outputs(
+                request_id, completion_output)
+            if not outputs:
+                return None
+        return self._new_request_output(request_id, outputs, finished)
+    def _new_request_output(
+        self,
+        request_id: str,
+        outputs: list[CompletionOutput],
+        finished: bool,
+    ) -> RequestOutput:
+        if self.output_kind == RequestOutputKind.DELTA:
+            # Side effect: logprobs processor forgets prompt logprobs
+            prompt_logprobs = self.logprobs_processor.pop_prompt_logprobs()
+        else:
+            prompt_logprobs = self.logprobs_processor.prompt_logprobs
+        return RequestOutput(
+            request_id=request_id,
+            prompt=self.prompt,
+            prompt_token_ids=self.prompt_token_ids,
+            prompt_logprobs=prompt_logprobs,
+            outputs=outputs,
+            finished=finished,
+        )
+    def _new_completion_output(
+        self,
+        token_ids: list[int],
+        finish_reason: Optional[FinishReason],
+        stop_reason: Union[int, str, None],
+    ) -> CompletionOutput:
+        finished = finish_reason is not None
+        delta = self.output_kind == RequestOutputKind.DELTA
+        # Prepare text and token_ids, based on delta mode
+        text = self.detokenizer.get_next_output_text(finished, delta)
+        if not delta:
+            token_ids = self.detokenizer.output_token_ids
+        # Prepare logprobs, based on delta mode
+        logprobs = self.logprobs_processor.logprobs
+        if delta and logprobs:
+            logprobs = logprobs[-len(token_ids):]
+        return CompletionOutput(
+            index=self.request_index,
+            text=text,
+            token_ids=token_ids,
+            logprobs=logprobs,
+            cumulative_logprob=self.logprobs_processor.cumulative_logprob,
+            finish_reason=str(finish_reason) if finished else None,
+            stop_reason=stop_reason if finished else None)
+class OutputProcessor:
+    """Process EngineCoreOutputs into RequestOutputs."""
+    def __init__(
+        self,
+        tokenizer: TokenizerGroup,
+        log_stats: bool,
+    ):
+        self.log_stats = log_stats
+        self.tokenizer = tokenizer
+        # Live requests keyed by request id.
+        self.request_states: dict[str, RequestState] = {}
+        # Parent requests keyed by the parent's request id.
+        self.parent_requests: dict[str, ParentRequest] = {}
+        self.lora_states = LoRARequestStates()
+    def get_num_unfinished_requests(self):
+        # Number of requests still tracked in request_states.
+        return len(self.request_states)
+    def has_unfinished_requests(self) -> bool:
+        return len(self.request_states) > 0
+    def propagate_error(self, e: Exception):
+        """Propagate error to all generate() tasks."""
+        # Every tracked request is expected to have a queue here.
+        for _, state in self.request_states.items():
+            assert state.queue is not None
+            state.queue.put(e)
+    def abort_requests(
+        self,
+        request_ids: Iterable[str],
+    ) -> list[str]:
+        """Stop tracking the given requests.
+        Args:
+          request_ids: ids of requests, or of parent requests, to abort.
+        Returns:
+          The request ids collected for abortion: each id that was still
+          tracked, plus the child ids of any matching parent request.
+        """
+        request_ids_to_abort = []
+        for request_id in request_ids:
+            req_state = self.request_states.pop(request_id, None)
+            if req_state is not None:
+                self.lora_states.abort_request(req_state)
+                request_ids_to_abort.append(request_id)
+            else:
+                # Not a tracked request: treat the id as a parent id.
+                parent = self.parent_requests.pop(request_id, None)
+                if parent and parent.child_requests:
+                    # NOTE(review): the recursive call's return value is
+                    # discarded and every child id is appended below,
+                    # whether or not it was still tracked - confirm intended.
+                    self.abort_requests(parent.child_requests)
+                    request_ids_to_abort.extend(parent.child_requests)
+        return request_ids_to_abort
+    def add_request(
+        self,
+        request: EngineCoreRequest,
+        prompt: Optional[str],
+        parent_req: Optional[ParentRequest] = None,
+        request_index: int = 0,
+        queue: Optional[RequestOutputCollector] = None,
+    ) -> None:
+        """Start tracking a new request.
+        Args:
+          request: the EngineCoreRequest to track.
+          prompt: original prompt text, if available.
+          parent_req: parent request when this is a child request.
+          request_index: index forwarded to the new RequestState.
+          queue: collector that receives outputs, or None to have them
+            returned from process_outputs().
+        Raises:
+          ValueError: if a request with the same id is already tracked.
+        """
+        request_id = request.request_id
+        if request_id in self.request_states:
+            raise ValueError(f"Request id {request_id} already running.")
+        req_state = RequestState.from_new_request(
+            tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
+            request=request,
+            prompt=prompt,
+            parent_req=parent_req,
+            request_index=request_index,
+            queue=queue,
+            log_stats=self.log_stats)
+        self.request_states[request_id] = req_state
+        self.lora_states.add_request(req_state)
+        if parent_req:
+            self.parent_requests[parent_req.request_id] = parent_req
+    def process_outputs(
+        self,
+        engine_core_outputs: list[EngineCoreOutput],
+        engine_core_timestamp: Optional[float] = None,
+        iteration_stats: Optional[IterationStats] = None,
+    ) -> OutputProcessorOutput:
+        """
+        Process the EngineCoreOutputs:
+        1) Compute stats for logging
+        2) Detokenize
+        3) Create and handle RequestOutput objects:
+            * If there is a queue (for usage with AsyncLLM),
+              put the RequestOutput objects into the queue for
+              handling by the per-request generate() tasks.
+            * If there is no queue (for usage with LLMEngine),
+              return a list of RequestOutput objects.
+        ****************** NOTE FOR DEVELOPERS ******************
+        vLLM V1 minimizes the number of python loops over the full
+        batch to ensure system overheads are minimized. This is the
+        only function that should loop over EngineCoreOutputs.
+        If you need to touch every element of the batch, do it from
+        within the loop below.
+        **********************************************************
+        """
+        request_outputs: list[RequestOutput] = []
+        reqs_to_abort: list[str] = []
+        for engine_core_output in engine_core_outputs:
+            req_id = engine_core_output.request_id
+            req_state = self.request_states.get(req_id)
+            if req_state is None:
+                # Ignore output for already-aborted request.
+                continue
+            # 1) Compute stats for this iteration.
+            self._update_stats_from_output(req_state, engine_core_output,
+                                           engine_core_timestamp,
+                                           iteration_stats)
+            new_token_ids = engine_core_output.new_token_ids
+            finish_reason = engine_core_output.finish_reason
+            stop_reason = engine_core_output.stop_reason
+            # Cleared only after the stats update above, which reads the
+            # flag as it stood before this output.
+            req_state.is_prefilling = False
+            # 2) Detokenize the token ids into text and perform stop checks.
+            stop_string = req_state.detokenizer.update(
+                new_token_ids, finish_reason == FinishReason.STOP)
+            if stop_string:
+                # A stop string found here overrides the engine's values.
+                finish_reason = FinishReason.STOP
+                stop_reason = stop_string
+            # 3) Compute sample and prompt logprobs for request, if required.
+            req_state.logprobs_processor.update_from_output(engine_core_output)
+            # 4) Create and handle RequestOutput objects.
+            if request_output := req_state.make_request_output(
+                    new_token_ids, finish_reason, stop_reason):
+                if req_state.queue is not None:
+                    # AsyncLLM: put into queue for handling by generate().
+                    req_state.queue.put(request_output)
+                else:
+                    # LLMEngine: return list of RequestOutputs.
+                    request_outputs.append(request_output)
+            # Free completed requests.
+            if finish_reason is not None:
+                self.request_states.pop(req_id)
+                # Remove parent request if applicable.
+                parent_req = req_state.parent_req
+                if parent_req and not parent_req.child_requests:
+                    self.parent_requests.pop(parent_req.request_id, None)
+                if not engine_core_output.finished:
+                    # If req not finished in EngineCore, but Detokenizer
+                    # detected stop string, abort needed in EngineCore.
+                    reqs_to_abort.append(req_id)
+                # Track per-request stats
+                self._update_stats_from_finished(req_state, finish_reason,
+                                                 iteration_stats)
+        self.lora_states.update_iteration_stats(iteration_stats)
+        return OutputProcessorOutput(
+            request_outputs=request_outputs,
+            reqs_to_abort=reqs_to_abort,
+        )
+    def _update_stats_from_output(self, req_state: RequestState,
+                                  engine_core_output: EngineCoreOutput,
+                                  engine_core_timestamp: Optional[float],
+                                  iteration_stats: Optional[IterationStats]):
+        # Record per-output stats; a no-op when no IterationStats is given.
+        if iteration_stats is None:
+            return
+        lora_stats = self.lora_states.get_stats(req_state)
+        # With stats enabled, a timestamp and per-request stats are required.
+        assert engine_core_timestamp is not None
+        assert req_state.stats is not None
+        iteration_stats.update_from_output(engine_core_output,
+                                           engine_core_timestamp,
+                                           req_state.is_prefilling,
+                                           req_state.prompt_len,
+                                           req_state.stats, lora_stats)
+    def _update_stats_from_finished(self, req_state: RequestState,
+                                    finish_reason: Optional[FinishReason],
+                                    iteration_stats: Optional[IterationStats]):
+        # Record end-of-request stats; a no-op when no IterationStats is
+        # given.
+        # NOTE(review): lora_states.finish_request() below is therefore
+        # skipped whenever iteration_stats is None - confirm intended.
+        if iteration_stats is None:
+            return
+        assert finish_reason is not None
+        assert req_state.stats is not None
+        iteration_stats.update_from_finished_request(
+            finish_reason=finish_reason,
+            num_prompt_tokens=len(req_state.prompt_token_ids),
+            max_tokens_param=req_state.max_tokens_param,
+            req_stats=req_state.stats)
+        self.lora_states.finish_request(req_state)
+        ParentRequest.observe_finished_request(
+            req_state.parent_req, iteration_stats,
+            req_state.stats.num_generation_tokens)