xinference 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_compat.py +2 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +72 -66
- xinference/core/model.py +78 -25
- xinference/core/supervisor.py +81 -10
- xinference/core/utils.py +12 -8
- xinference/core/worker.py +32 -0
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +25 -3
- xinference/model/audio/f5tts.py +15 -10
- xinference/model/audio/f5tts_mlx.py +260 -0
- xinference/model/audio/fish_speech.py +35 -111
- xinference/model/audio/model_spec.json +19 -3
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/audio/utils.py +32 -0
- xinference/model/image/core.py +69 -1
- xinference/model/image/model_spec.json +145 -4
- xinference/model/image/model_spec_modelscope.json +150 -4
- xinference/model/image/stable_diffusion/core.py +45 -13
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +143 -0
- xinference/model/llm/llm_family.py +15 -36
- xinference/model/llm/llm_family_modelscope.json +148 -0
- xinference/model/llm/mlx/core.py +37 -32
- xinference/model/llm/transformers/cogagent.py +272 -0
- xinference/model/llm/transformers/core.py +2 -0
- xinference/model/llm/transformers/qwen2_vl.py +12 -1
- xinference/model/llm/utils.py +28 -3
- xinference/model/llm/vllm/core.py +48 -9
- xinference/model/llm/vllm/xavier/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/allocator.py +74 -0
- xinference/model/llm/vllm/xavier/block.py +112 -0
- xinference/model/llm/vllm/xavier/block_manager.py +71 -0
- xinference/model/llm/vllm/xavier/block_tracker.py +116 -0
- xinference/model/llm/vllm/xavier/engine.py +247 -0
- xinference/model/llm/vllm/xavier/executor.py +132 -0
- xinference/model/llm/vllm/xavier/scheduler.py +422 -0
- xinference/model/llm/vllm/xavier/test/__init__.py +13 -0
- xinference/model/llm/vllm/xavier/test/test_xavier.py +122 -0
- xinference/model/llm/vllm/xavier/transfer.py +298 -0
- xinference/model/video/diffusers.py +14 -0
- xinference/model/video/model_spec.json +15 -0
- xinference/model/video/model_spec_modelscope.json +16 -0
- xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
- xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
- xinference/thirdparty/cosyvoice/bin/train.py +42 -8
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
- xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
- xinference/thirdparty/cosyvoice/cli/model.py +330 -80
- xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
- xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
- xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
- xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
- xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
- xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
- xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
- xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
- xinference/thirdparty/cosyvoice/utils/common.py +28 -1
- xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
- xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
- xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
- xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
- xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
- xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
- xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
- xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
- xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
- xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
- xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
- xinference/thirdparty/fish_speech/tools/schema.py +11 -28
- xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
- xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
- xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
- xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
- xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
- xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
- xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
- xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
- xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
- xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
- xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
- xinference/thirdparty/matcha/utils/utils.py +2 -2
- xinference/types.py +13 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.51a587ff.css +2 -0
- xinference/web/ui/build/static/css/main.51a587ff.css.map +1 -0
- xinference/web/ui/build/static/js/main.1eb206d1.js +3 -0
- xinference/web/ui/build/static/js/main.1eb206d1.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/03c4052f1b91f6ba0c5389bdcf49c43319b4076c08e4b8585dab312538ae290a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1786b83003b8e9605a0f5f855a185d4d16e38fc893dfb326a2a9cca206b4240a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/17cbc181dd674b9150b80c73ed6a82656de0082d857f6e5f66d9716129ac0b38.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/185ceb8872d562e032b47e79df6a45670e06345b8ed70aad1a131e0476783c5c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2b484da66c724d0d56a40849c109327408796a668b1381511b6e9e03baa48658.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2cbbbce9b84df73330d4c42b82436ed881b3847628f2fbc346aa62e2859fd88c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2ec9b14431ed33ce6901bf9f27007be4e6e472709c99d6e22b50ce528e4b78ee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3b966db018f96be4a055d6ca205f0990d4d0b370e2980c17d8bca2c9a021819c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3eefb411b24c2b3ce053570ef50daccf154022f0e168be5ed0fec21394baf9f4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/522b229e3cac219123f0d69673f5570e191c2d2a505dc65b312d336eae2279c0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52e45f17ba300580ea3fcc9f9228ccba194bb092b76f25e9255af311f8b05aab.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5a0bc4631f936459afc1a3b1d3ec2420118b1f00e11f60ccac3e08088f3f27a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6329bc76c406fe5eb305412383fbde5950f847bb5e43261f73f37622c365acb4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63c8e07687ea53a4f8a910ee5e42e0eb26cd1acbfbe820f3e3248a786ee51401.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69b2d5001684174ec9da57e07914eed3eac4960018bceb6cbfa801d861301d7c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/710c1acda69e561e30a933b98c6a56d50197868b15c21e2aad55ab6d46649eb6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/720deca1fce5a1dc5056048fa8258fd138a82ea855f350b6613f104a73fb761f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/76a23b92d26a499c57e61eea2b895fbc9771bd0849a72e66f8e633192017978b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/858063f23b34dfe600254eb5afd85518b0002ec4b30b7386616c45600826e3b2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/920b82c1c89124cf217109eeedbfcd3aae3b917be50c9dfb6bbb4ce26bdfd2e7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/94d8b7aeb0076f2ce07db598cea0e87b13bc8d5614eb530b8d6e696c2daf6f88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9e917fe7022d01b2ccbe5cc0ce73d70bb72bee584ff293bad71bdff6695dee28.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f28fdb8399f1d0474f0aca86f1658dc94f5bf0c90f6146352de150692de8862.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a0dfafa06b2bb7cba8cad41c482503f61944f759f4318139362602ef5cc47ccb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/afb8084f539534cd594755ea2205ecd5bd1f62dddcfdf75a2eace59a28131278.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b57b1438b77294c1f3f6cfce12ac487d8106c6f016975ba0aec94d98997e2e1e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9917b0bf8e4d55ccbac1c334aa04d6ff3c5b6ed9e5d38b9ea2c687fa7d3f5a9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bbcc94b0149963d1d6f267ee1f4f03d3925b758392ce2f516c3fe8af0e0169fc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bdee44abeadc4abc17d41c52eb49c6e19a4b1a267b6e16876ce91bdeeebfc52d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/beb112b70f4a56db95920a9e20efb6c97c37b68450716730217a9ee1a9ae92be.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c88db97be0cdf440193b3995996e83510a04cb00048135485fc0e26d197e80b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d49e5314d34310a62d01a03067ce1bec5da00abce84c5196aa9c6842fa79a430.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d7664d18c4ddbad9c3a6a31b91f7c00fb0dde804608674a9860ee50f33e54708.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9072c318b819b7c90a0f7e9cc0b6413b4dbeb8e9859898e53d75ea882fcde99.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/db16a983bc08a05f0439cc61ca0840e49e1d8400eef678909f16c032a418a3d6.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e242c583c2dbc2784f0fcf513523975f7d5df447e106c1c17e49e8578a6fc3ed.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/eac5f1296513e69e4b96f750ddccd4d0264e2bae4e4c449144e83274a48698d9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ed57202cb79649bb716400436590245547df241988fc7c8e1d85d132299542d2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f125bf72e773a14cdaebd0c343e80adb909d12e317ee5c00cd4a57442fbe2c62.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +67 -3
- xinference/web/ui/node_modules/@babel/runtime/package.json +592 -538
- xinference/web/ui/node_modules/html-parse-stringify/package.json +50 -0
- xinference/web/ui/node_modules/i18next/dist/esm/package.json +1 -0
- xinference/web/ui/node_modules/i18next/package.json +129 -0
- xinference/web/ui/node_modules/react-i18next/.eslintrc.json +74 -0
- xinference/web/ui/node_modules/react-i18next/dist/es/package.json +1 -0
- xinference/web/ui/node_modules/react-i18next/package.json +162 -0
- xinference/web/ui/node_modules/void-elements/package.json +34 -0
- xinference/web/ui/package-lock.json +69 -3
- xinference/web/ui/package.json +2 -0
- xinference/web/ui/src/locales/en.json +186 -0
- xinference/web/ui/src/locales/zh.json +186 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/METADATA +19 -11
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/RECORD +178 -111
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +0 -943
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
- xinference/thirdparty/fish_speech/tools/webui.py +0 -548
- xinference/web/ui/build/static/css/main.5061c4c3.css +0 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +0 -1
- xinference/web/ui/build/static/js/main.4eb4ee80.js +0 -3
- xinference/web/ui/build/static/js/main.4eb4ee80.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a158a9ffa0c9b169aee53dd4a0c44501a596755b4e4f6ede7746d65a72e2a71f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +0 -1
- /xinference/web/ui/build/static/js/{main.4eb4ee80.js.LICENSE.txt → main.1eb206d1.js.LICENSE.txt} +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/LICENSE +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/WHEEL +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.1.0.dist-info → xinference-1.2.0.dist-info}/top_level.txt +0 -0

xinference/model/llm/vllm/xavier/allocator.py
@@ -0,0 +1,74 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.core.block.interfaces import DeviceAwareBlockAllocator
+from vllm.platforms import current_platform
+from vllm.utils import Device
+
+from .block import XavierPrefixCachingBlockAllocator
+
+
+class XavierCpuGpuBlockAllocator(CpuGpuBlockAllocator):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._xavier_config: Optional[Dict[str, Any]] = None
+
+    @property
+    def xavier_config(self):
+        return self._xavier_config
+
+    @xavier_config.setter
+    def xavier_config(self, v: Dict[str, Any]):
+        self._xavier_config = v
+        self._allocators[Device.GPU].xavier_config = v
+
+    @staticmethod
+    def create(
+        allocator_type: str,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+        block_size: int,
+    ) -> DeviceAwareBlockAllocator:
+        """Xinference Change!!!
+        1. The code is copied here because the `allocator` needs to be instantiated as a subclass.
+        2. Why not re-instantiate it externally?
+        Re-instantiating the `allocator` is costly because it requires initializing many tensors.
+        """
+
+        # For HPU, block id 0 is used only for padding
+        reserved_blocks = 1 if current_platform.is_hpu() else 0
+        block_ids = list(range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
+        num_gpu_blocks -= reserved_blocks
+        gpu_block_ids = block_ids[:num_gpu_blocks]
+        cpu_block_ids = block_ids[num_gpu_blocks:]
+
+        gpu_allocator = XavierPrefixCachingBlockAllocator(
+            run_isolation=True,
+            num_blocks=num_gpu_blocks,
+            block_size=block_size,
+            block_ids=gpu_block_ids,
+        )
+
+        cpu_allocator = XavierPrefixCachingBlockAllocator(
+            num_blocks=num_cpu_blocks,
+            block_size=block_size,
+            block_ids=cpu_block_ids,
+        )
+
+        return XavierCpuGpuBlockAllocator(
+            cpu_block_allocator=cpu_allocator,
+            gpu_block_allocator=gpu_allocator,
+        )

xinference/model/llm/vllm/xavier/block.py
@@ -0,0 +1,112 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import logging
+from typing import Any, Dict, Optional
+
+import xoscar as xo
+from vllm.core.block.interfaces import BlockId
+from vllm.core.block.prefix_caching_block import (
+    BlockTracker,
+    PrefixCachingBlockAllocator,
+)
+
+from .....isolation import Isolation
+
+logger = logging.getLogger(__name__)
+
+
+class XavierInnerBlockTracker(BlockTracker):
+    """Used to track the status of a block inside the prefix caching allocator"""
+
+    """
+    Here, two fixed attributes, `transferred` and `executed`,
+    have been added to the `BlockTracker` class to mark the status of the corresponding `block_id`.
+    We cannot directly set attributes on the `Block` object
+    because the `Block` objects are dynamically allocated with each scheduling.
+    The `Block` objects executed in two different scheduling steps may have the same `id`, `hash`, etc.,
+    but the instance objects may differ.
+    The BlockTracker object inside vllm is one-to-one with the block_id.
+    """
+    __slots__ = ("active", "last_accessed", "computed", "transferred", "executed")
+
+    def __init__(self):
+        super().__init__()
+        self.transferred = False
+        self.executed = False
+
+
+class XavierPrefixCachingBlockAllocator(PrefixCachingBlockAllocator):
+    def __init__(self, *args, run_isolation: bool = False, **kwargs):
+        super().__init__(*args, **kwargs)
+        for _id in self._block_tracker.keys():
+            self._block_tracker[_id] = XavierInnerBlockTracker()
+
+        self._xavier_config: Optional[Dict[str, Any]] = None
+        self._block_tracker_ref = None
+        if run_isolation:
+            self._isolation = Isolation(
+                asyncio.new_event_loop(), threaded=True, daemon=True
+            )
+            self._isolation.start()
+        else:
+            self._isolation = None  # type: ignore
+
+    def __del__(self):
+        if self._isolation is not None:
+            self._isolation.stop()
+
+    @property
+    def xavier_config(self):
+        return self._xavier_config
+
+    @xavier_config.setter
+    def xavier_config(self, v: Dict[str, Any]):
+        self._xavier_config = v
+
+    async def _get_block_tracker_ref(self):
+        from .block_tracker import VLLMBlockTracker
+
+        if self._block_tracker_ref is None:
+            block_tracker_address = self.xavier_config.get("block_tracker_address")
+            self._block_tracker_ref = await xo.actor_ref(
+                address=block_tracker_address, uid=VLLMBlockTracker.default_uid()
+            )
+        return self._block_tracker_ref
+
+    async def unregister_block(self, block_id: int):
+        assert self._xavier_config is not None
+        tracker_ref = await self._get_block_tracker_ref()
+        await tracker_ref.unregister_block(
+            self.xavier_config.get("virtual_engine"),
+            self.xavier_config.get("rank_address"),
+            block_id,
+        )
+
+    def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
+        """
+        This is the only entry point where the `block_id` is evicted from the cache.
+        Therefore, when the `block_id` is evicted, the tracker actor needs to unregister the block information.
+        At the same time, make sure to reset the attributes corresponding to that `block_id`.
+        """
+        evicted_block_id = super()._maybe_allocate_evicted_block_id()
+        logger.debug(f"block_id: {evicted_block_id} will be evicted from the cache.")
+        if evicted_block_id is not None and self._isolation is not None:
+            tracker = self._block_tracker[evicted_block_id]
+            assert isinstance(tracker, XavierInnerBlockTracker)
+            tracker.transferred = False
+            tracker.executed = False
+            self._isolation.call(self.unregister_block(evicted_block_id))
+        logger.debug(f"block_id: {evicted_block_id} will be used again.")
+        return evicted_block_id

xinference/model/llm/vllm/xavier/block_manager.py
@@ -0,0 +1,71 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Any, Dict, Optional
+
+from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.core.block.interfaces import Block
+from vllm.core.block_manager import SelfAttnBlockSpaceManager
+from vllm.sequence import SequenceGroup, SequenceStatus
+from vllm.utils import Device
+
+from .allocator import XavierCpuGpuBlockAllocator
+
+logger = logging.getLogger(__name__)
+
+
+class XavierBlockManager(SelfAttnBlockSpaceManager):
+    def __init__(self, *args, **kwargs):
+        # Monkey patch
+        CpuGpuBlockAllocator.create = XavierCpuGpuBlockAllocator.create
+        super().__init__(*args, **kwargs)
+        self._xavier_config: Optional[Dict[str, Any]] = None
+        logger.debug("Init xavier block manager done.")
+
+    @property
+    def xavier_config(self):
+        return self._xavier_config
+
+    @xavier_config.setter
+    def xavier_config(self, value: Dict[str, Any]):
+        self._xavier_config = value
+        self.block_allocator.xavier_config = value
+
+    def get_block_by_block_id(self, seq_id: int, block_id: int) -> Block:
+        table = self.block_tables[seq_id]
+        for b in table.blocks:
+            if b.block_id == block_id:
+                return b
+
+    def get_block_status_by_block_id(self, status_name: str, block_id: int) -> bool:
+        tracker = self.block_allocator._allocators[Device.GPU]._block_tracker[block_id]
+        return getattr(tracker, status_name)
+
+    def set_block_status_by_block_id(
+        self, status_name: str, block_id: int, status: bool
+    ) -> None:
+        tracker = self.block_allocator._allocators[Device.GPU]._block_tracker[block_id]
+        assert getattr(tracker, status_name, None) is not None
+        setattr(tracker, status_name, status)
+
+    def allocate(self, seq_group: SequenceGroup) -> None:
+        """
+        If the `seq_group` has the `transferred` attribute,
+        it indicates that the `seq_group` has gone through the transfer process,
+        so the block allocation logic should not be executed again.
+        """
+        waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
+        if all([getattr(s, "transferred", False) for s in waiting_seqs]):
+            return
+        super().allocate(seq_group)
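
A hypothetical helper (not part of this diff) showing how the per-block flags introduced in block.py can be driven through the accessors above; in the actual release the callers presumably live in scheduler.py and transfer.py, which are listed but not reproduced here.

from xinference.model.llm.vllm.xavier.block_manager import XavierBlockManager


def mark_block_received(manager: XavierBlockManager, block_id: int) -> None:
    # KV data for this block has arrived from a remote replica...
    manager.set_block_status_by_block_id("transferred", block_id, True)
    # ...but no model step on this replica has consumed it yet.
    manager.set_block_status_by_block_id("executed", block_id, False)


def is_block_ready(manager: XavierBlockManager, block_id: int) -> bool:
    # A transferred block can be fed to the model on this replica.
    return manager.get_block_status_by_block_id("transferred", block_id)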

xinference/model/llm/vllm/xavier/block_tracker.py
@@ -0,0 +1,116 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+from typing import Dict, List, Optional, Set, Tuple
+
+import xoscar as xo
+
+
+class VLLMBlockTracker(xo.StatelessActor):
+    @classmethod
+    def default_uid(cls):
+        return f"vllm-block-tracker-actor"
+
+    def __init__(self):
+        super().__init__()
+        # engine -> hash_to_address_and_block_id
+        self._hash_to_address_and_block_id: Dict[
+            int, Dict[int, Set[Tuple[str, int]]]
+        ] = {}
+        # engine -> address_to_hash_and_block_id
+        self._address_to_hash_and_block_id: Dict[
+            int, Dict[str, Set[Tuple[int, int]]]
+        ] = {}
+
+    def register_blocks(
+        self, virtual_engine: int, block_infos: List[Tuple[int, int]], address: str
+    ):
+        # Update query meta
+        if virtual_engine not in self._hash_to_address_and_block_id:
+            self._hash_to_address_and_block_id[virtual_engine] = {}
+        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
+            virtual_engine
+        ]
+        for hash_content, block_id in block_infos:
+            if hash_content not in hash_to_address_and_block_id:
+                hash_to_address_and_block_id[hash_content] = {
+                    (address, block_id),
+                }
+            else:
+                hash_to_address_and_block_id[hash_content].add((address, block_id))
+
+        # Update remove meta
+        if virtual_engine not in self._address_to_hash_and_block_id:
+            self._address_to_hash_and_block_id[virtual_engine] = {}
+        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
+            virtual_engine
+        ]
+        if address not in address_to_hash_and_block_id:
+            address_to_hash_and_block_id[address] = set()
+        address_to_hash_and_block_id[address].update(block_infos)
+
+    def query_blocks(
+        self, virtual_engine: int, hash_contents: List[Tuple[int, int]]
+    ) -> Dict[str, Set[Tuple[int, int, int]]]:
+        if virtual_engine not in self._hash_to_address_and_block_id:
+            return {}
+        hash_to_address_and_block_id = self._hash_to_address_and_block_id[
+            virtual_engine
+        ]
+        remote: Dict[str, Set[Tuple[int, int, int]]] = {}
+        for hash_content, _id in hash_contents:
+            if (
+                hash_content in hash_to_address_and_block_id
+            ) and hash_to_address_and_block_id[hash_content]:
+                # TODO: Randomly select here, and try to distribute requests as evenly as possible.
+                # There may be better methods in the future.
+                address, block_id = random.choice(
+                    list(hash_to_address_and_block_id[hash_content])
+                )
+                if address not in remote:
+                    remote[address] = {
+                        (hash_content, block_id, _id),
+                    }
+                else:
+                    remote[address].add((hash_content, block_id, _id))
+        return remote
+
+    def unregister_block(self, virtual_engine: int, address: str, block_id: int):
+        if (virtual_engine not in self._address_to_hash_and_block_id) or (
+            virtual_engine not in self._hash_to_address_and_block_id
+        ):
+            return
+
+        # Update remove meta
+        address_to_hash_and_block_id = self._address_to_hash_and_block_id[
+            virtual_engine
+        ]
+        if address not in address_to_hash_and_block_id:
+            return
+        hash_and_block_id = address_to_hash_and_block_id[address]
+        detail: Optional[Tuple[int, int]] = None
+        for hash_content, _id in hash_and_block_id.copy():
+            if _id == block_id:
+                detail = (hash_content, block_id)
+                hash_and_block_id.discard(detail)
+                break
+
+        # Update query meta
+        if detail is not None:
+            hash_to_address_and_block_id = self._hash_to_address_and_block_id[
+                virtual_engine
+            ]
+            _hash = detail[0]
+            if _hash in hash_to_address_and_block_id:
+                hash_to_address_and_block_id[_hash].discard((address, detail[1]))
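
The tracker above is plain bookkeeping: per virtual engine it maps a prefix hash to the (address, block_id) pairs that hold it, plus a reverse index used for cleanup. A minimal sketch of that round trip, for illustration only (the hashes, block ids and addresses are made up, and in xinference the class runs as a xoscar actor reached via xo.actor_ref as in block.py, rather than being constructed directly):

from xinference.model.llm.vllm.xavier.block_tracker import VLLMBlockTracker

tracker = VLLMBlockTracker()  # direct construction, illustration only

# Replica A advertises two computed prefix blocks: (prefix hash, its local block id).
tracker.register_blocks(
    virtual_engine=0,
    block_infos=[(0xAAA, 3), (0xBBB, 4)],
    address="worker-a:31001",
)

# Replica B asks where blocks with the same hashes live, passing its own block ids.
remote = tracker.query_blocks(0, [(0xAAA, 10), (0xBBB, 11)])
# -> {"worker-a:31001": {(0xAAA, 3, 10), (0xBBB, 4, 11)}}
#    i.e. (hash, block id on the owner, block id on the requester)

# When replica A evicts block 3, block.py unregisters it so it is no longer advertised.
tracker.unregister_block(0, "worker-a:31001", 3)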

xinference/model/llm/vllm/xavier/engine.py
@@ -0,0 +1,247 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Dict, List, Optional, Type, Union
+
+from vllm import AsyncEngineArgs, EmbeddingRequestOutput, RequestOutput
+from vllm.config import VllmConfig
+from vllm.engine.async_llm_engine import AsyncLLMEngine, _AsyncLLMEngine
+from vllm.engine.llm_engine import SchedulerOutputState
+from vllm.engine.metrics_types import StatLoggerBase
+from vllm.executor.executor_base import ExecutorBase
+from vllm.sequence import ExecuteModelRequest
+from vllm.usage.usage_lib import UsageContext
+
+from .executor import XavierExecutor
+from .scheduler import XavierScheduler
+
+logger = logging.getLogger(__name__)
+
+
+class XavierInternalEngine(_AsyncLLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._xavier_config = kwargs["vllm_config"].xavier_config
+        self.scheduler = [
+            XavierScheduler(
+                self.scheduler_config,
+                self.cache_config,
+                self.lora_config,
+                self.parallel_config.pipeline_parallel_size,
+                self.async_callbacks[v_id]
+                if self.model_config.use_async_output_proc
+                else None,
+                xavier_config=self._xavier_config,
+                virtual_engine=v_id,
+            )
+            for v_id in range(self.parallel_config.pipeline_parallel_size)
+        ]
+        self.output_processor.scheduler = self.scheduler
+        self.model_executor.scheduler = self.scheduler
+
+    async def step_async(
+        self, virtual_engine: int
+    ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
+        """Performs one decoding iteration and returns newly generated results.
+        The workers are ran asynchronously if possible.
+
+        This function performs one decoding iteration of the engine. It first
+        schedules the sequences to be executed in the next iteration and the
+        token blocks to be swapped in/out/copy. Then, it executes the model
+        and updates the scheduler with the model outputs. Finally, it decodes
+        the sequences and returns the newly generated results.
+        """
+        # these are cached outputs from previous iterations. None if on first
+        # iteration
+        cached_outputs = self.cached_scheduler_outputs[virtual_engine]
+        seq_group_metadata_list = cached_outputs.seq_group_metadata_list
+        scheduler_outputs = cached_outputs.scheduler_outputs
+        allow_async_output_proc = cached_outputs.allow_async_output_proc
+
+        ctx = self.scheduler_contexts[virtual_engine]
+
+        # Clear outputs for each new scheduler iteration
+        ctx.request_outputs.clear()
+
+        # skip the scheduler if there are any remaining steps in the seq groups.
+        # This ensures that the scheduler is only called again when the current
+        # batch has completed.
+        if not self._has_remaining_steps(seq_group_metadata_list):
+            # Schedule iteration
+            """Xinference Change!!!
+            Why copy the entire function code of vllm:
+            The purpose here is to modify the way the `schedule` function is invoked to asynchronous calling.
+            No other modifications were made elsewhere.
+            """
+            (
+                seq_group_metadata_list,
+                scheduler_outputs,
+                allow_async_output_proc,
+            ) = await self.scheduler[virtual_engine].schedule()
+
+            ctx.seq_group_metadata_list = seq_group_metadata_list
+            ctx.scheduler_outputs = scheduler_outputs
+
+            # Maybe switch from async mode to sync mode
+            if not allow_async_output_proc and len(ctx.output_queue) > 0:
+                self._process_model_outputs(ctx=ctx)
+
+            if (
+                self.scheduler_config.is_multi_step
+                and scheduler_outputs.num_lookahead_slots > 0
+            ):
+                # cache the scheduler outputs for the next iteration if we have
+                # lookahead slots
+                self._cache_scheduler_outputs_for_multi_step(
+                    virtual_engine,
+                    seq_group_metadata_list,
+                    scheduler_outputs,
+                    allow_async_output_proc,
+                )
+
+        assert seq_group_metadata_list is not None
+        assert scheduler_outputs is not None
+
+        if not scheduler_outputs.is_empty():
+            finished_requests_ids = self.scheduler[
+                virtual_engine
+            ].get_and_reset_finished_requests_ids()
+
+            # Check if we have a cached last_output from the previous iteration.
+            # For supporting PP this is probably the best way to pass the
+            # sampled_token_ids, as a separate broadcast over all the PP stages
+            # will cause one virtual engine's microbatch to block the pipeline.
+            last_sampled_token_ids = self._get_last_sampled_token_ids(virtual_engine)
+
+            execute_model_req = ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list,
+                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
+                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
+                blocks_to_copy=scheduler_outputs.blocks_to_copy,
+                virtual_engine=virtual_engine,
+                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
+                running_queue_size=scheduler_outputs.running_queue_size,
+                finished_requests_ids=finished_requests_ids,
+                # We use ExecuteModelRequest to pass the last sampled_token_ids
+                # to each of the non-last PP stages for in-place prepare_input.
+                last_sampled_token_ids=last_sampled_token_ids,
+            )
+
+            if allow_async_output_proc:
+                execute_model_req.async_callback = self.async_callbacks[virtual_engine]
+
+            # Execute the model.
+            outputs = await self.model_executor.execute_model_async(execute_model_req)
+
+            # we need to do this here so that last step's sampled_token_ids can
+            # be passed to the next iteration for PP.
+            if self.scheduler_config.is_multi_step:
+                self._update_cached_scheduler_output(virtual_engine, outputs)
+        else:
+            if len(ctx.output_queue) > 0:
+                self._process_model_outputs(ctx=ctx)
+            outputs = []
+
+        # Finish the current step for all the sequence groups.
+        if self.scheduler_config.is_multi_step:
+            for seq_group in seq_group_metadata_list:
+                seq_group.finish_step()
+
+        if not self._has_remaining_steps(seq_group_metadata_list):
+            # Clear the cache if we have finished all the steps
+            if self.scheduler_config.is_multi_step:
+                self.cached_scheduler_outputs[virtual_engine] = SchedulerOutputState()
+
+            # is_first_step_output is True only when the num_steps of all
+            # the sequences are 1. When the num_steps > 1,
+            # multi_step_model_runner does the first-step output append.
+            is_first_step_output: bool = (
+                False
+                if not seq_group_metadata_list
+                else seq_group_metadata_list[0].state.num_steps == 1
+            )
+
+            ctx.append_output(
+                outputs=outputs,
+                seq_group_metadata_list=seq_group_metadata_list,
+                scheduler_outputs=scheduler_outputs,
+                is_async=allow_async_output_proc,
+                is_last_step=True,
+                is_first_step_output=is_first_step_output,
+            )
+
+            if outputs and allow_async_output_proc:
+                assert (
+                    len(outputs) == 1
+                ), "Async postprocessor expects only a single output set"
+                self._advance_to_next_step(
+                    outputs[0],
+                    seq_group_metadata_list,
+                    scheduler_outputs.scheduled_seq_groups,
+                )
+
+            if not allow_async_output_proc:
+                self._process_model_outputs(ctx=ctx)
+
+                # Log stats.
+                self.do_log_stats(scheduler_outputs, outputs)
+
+                # Tracing
+                self.do_tracing(scheduler_outputs)
+
+        else:
+            # Multi-step case
+            return ctx.request_outputs
+
+        if not self.has_unfinished_requests():
+            # Drain async postprocessor (if exists)
+            if len(ctx.output_queue) > 0:
+                self._process_model_outputs(ctx=ctx)
+            assert len(ctx.output_queue) == 0
+
+        return ctx.request_outputs
+
+
+class XavierEngine(AsyncLLMEngine):
+    _engine_class: Type[_AsyncLLMEngine] = XavierInternalEngine
+    _xavier_config: Optional[Dict] = None
+
+    @classmethod
+    def _get_executor_cls(cls, engine_config: VllmConfig) -> Type[ExecutorBase]:
+        logger.debug(f"Initializing Xavier executor.")
+        return XavierExecutor
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+        xavier_config: Optional[Dict] = None,
+    ) -> "AsyncLLMEngine":
+        cls._xavier_config = xavier_config
+        return super().from_engine_args(
+            engine_args, engine_config, start_engine_loop, usage_context, stat_loggers
+        )
+
+    def __init__(self, *args, **kwargs):
+        # set xavier_config to `vllm_config`,
+        # because it may be needed everywhere in the vllm internal components
+        kwargs["vllm_config"].xavier_config = self._xavier_config
+        super().__init__(*args, **kwargs)
+
+    async def init_xavier(self):
+        await self.engine.model_executor.init_transfer()
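
Finally, a hedged wiring sketch of how these pieces could be assembled. This is not the actual xinference launch path (which presumably lives in xinference/model/llm/vllm/core.py, also touched in this release); the xavier_config keys shown are simply the ones read by block.py above, and whatever else a real deployment needs (for example the transfer settings consumed by transfer.py) is omitted.

from vllm import AsyncEngineArgs

from xinference.model.llm.vllm.xavier.engine import XavierEngine


async def start_xavier_engine(model_path: str, tracker_address: str, rank_address: str):
    xavier_config = {
        # address of the VLLMBlockTracker actor, reached via xo.actor_ref in block.py
        "block_tracker_address": tracker_address,
        # virtual engine index this replica registers its blocks under
        "virtual_engine": 0,
        # this replica's own address, advertised alongside each block
        "rank_address": rank_address,
    }
    engine = XavierEngine.from_engine_args(
        # Xavier shares prefix-cache blocks, so prefix caching is assumed to be on.
        AsyncEngineArgs(model=model_path, enable_prefix_caching=True),
        xavier_config=xavier_config,
    )
    await engine.init_xavier()  # lets the XavierExecutor set up its transfer channel
    return engine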