PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/platforms/tpu.py ADDED Viewed

@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Optional, Union, cast
+import torch
+from tpu_info import device
+from vllm.inputs import ProcessorInputs, PromptType
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
+from .interface import Platform, PlatformEnum, _Backend
+if TYPE_CHECKING:
+    from vllm.config import BlockSize, ModelConfig, VllmConfig
+    from vllm.pooling_params import PoolingParams
+else:
+    BlockSize = None
+    ModelConfig = None
+    VllmConfig = None
+    PoolingParams = None
+logger = init_logger(__name__)
+USE_TPU_COMMONS = False
+class TpuPlatform(Platform):
+    _enum = PlatformEnum.TPU
+    device_name: str = "tpu"
+    device_type: str = "tpu"
+    dispatch_key: str = "XLA"
+    ray_device_key: str = "TPU"
+    dist_backend: str = "gloo"
+    device_control_env_var: str = "TPU_VISIBLE_CHIPS"
+    simple_compile_backend: str = "openxla"
+    supported_quantization: list[str] = [
+        "fp8", "tpu_int8", "compressed-tensors"
+    ]
+    additional_env_vars: list[str] = [
+        "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
+    ]
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink, use_sparse) -> str:
+        if use_sparse:
+            raise NotImplementedError(
+                "Sparse Attention is not supported on TPU.")
+        if selected_backend != _Backend.PALLAS:
+            logger.info("Cannot use %s backend on TPU.", selected_backend)
+        if not use_v1:
+            raise ValueError("TPU backend only supports V1.")
+        logger.info("Using Pallas V1 backend.")
+        return "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
+    @classmethod
+    def set_device(cls, device: torch.device) -> None:
+        """
+        Set the device for the current platform.
+        """
+        torch.tpu.set_device(device)
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        chip_type, _ = device.get_local_chips()
+        return f"TPU {chip_type.name}"
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        raise NotImplementedError
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPU"
+    @classmethod
+    def get_infinity_values(cls, dtype: torch.dtype) -> tuple[float, float]:
+        return torch.finfo(dtype).min, torch.finfo(dtype).max
+    @classmethod
+    def can_update_inplace(cls):
+        return False
+    @classmethod
+    def get_lora_vocab_padding_size(cls) -> int:
+        return 1
+    @classmethod
+    def inference_mode(cls):
+        return torch.no_grad()
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        from vllm.config import CompilationLevel, CUDAGraphMode
+        cache_config = vllm_config.cache_config
+        # For v0, the default block size is 16.
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = cast(BlockSize, 16)
+        compilation_config = vllm_config.compilation_config
+        # TPU only supports DYNAMO_ONCE compilation level
+        if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
+            logger.info("[TPU] Forcing DYNAMO_ONCE compilation level, and "
+                        "disabling cudagraph.")
+            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+        if compilation_config.cudagraph_mode is None or \
+                compilation_config.cudagraph_mode.max_cudagraph_mode() \
+                    != CUDAGraphMode.NONE:
+            logger.info("[TPU] CUDA graph is not supported on TPU, "
+                        "disabling cudagraphs.")
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+        if compilation_config.backend == "":
+            compilation_config.backend = "openxla"
+        assert vllm_config.speculative_config is None, \
+            "TPU does not support speculative decoding"
+        model_config = vllm_config.model_config
+        if model_config is not None and model_config.dtype in (torch.float16,
+                                                               torch.float32):
+            logger.warning(
+                "The TPU backend currently does not support %s. "
+                "Using bfloat16 instead.", model_config.dtype)
+            model_config.dtype = torch.bfloat16
+        from vllm.v1.attention.backends.pallas import PallasAttentionBackend
+        cache_config.block_size = PallasAttentionBackend.get_page_size(
+            vllm_config)  # type: ignore[assignment]
+        parallel_config = vllm_config.parallel_config
+        scheduler_config = vllm_config.scheduler_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker"
+        assert not vllm_config.speculative_config, (
+            "Speculative decoding is not yet supported for TPU backend")
+        if scheduler_config.is_multimodal_model and not \
+                scheduler_config.disable_chunked_mm_input:
+            logger.warning("TPU does not support running Multimodal models"\
+            " without setting `--disable_chunked_mm_input`. " \
+            "Forcing --disable_chunked_mm_input.")
+            scheduler_config.disable_chunked_mm_input = True
+        if model_config and model_config.use_mla:
+            logger.info(
+                "MLA is enabled on a non-GPU platform; forcing chunked "
+                "prefill and prefix caching to be disabled.")
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.max_num_batched_tokens = max(
+                vllm_config.scheduler_config.max_model_len,
+                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on TPU.")
+        return False
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator"  # noqa
+    @classmethod
+    def use_all_gather(cls) -> bool:
+        return True
+    @classmethod
+    def validate_request(
+        cls,
+        prompt: PromptType,
+        params: Union[SamplingParams, PoolingParams],
+        processed_inputs: ProcessorInputs,
+    ) -> None:
+        """Raises if this request is unsupported on this platform"""
+        if (isinstance(params, SamplingParams)
+                and params.sampling_type == SamplingType.RANDOM_SEED):
+            raise ValueError("Torch XLA does not support per-request seed.")
+    @classmethod
+    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
+                                    model_config: "ModelConfig") -> bool:
+        return True
+    @classmethod
+    @torch.compile(backend="openxla")
+    def insert_blocks_to_device(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        torch.ops.xla.dynamo_set_buffer_donor_(dst_cache, True)
+        dst_cache[dst_block_indices] = src_cache[src_block_indices].to(
+            dst_cache.device)
+    @classmethod
+    @torch.compile(backend="openxla")
+    def swap_out_blocks_to_host(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """ tpu blocks to cpu blocks"""
+        torch.ops.xla.dynamo_set_buffer_donor_(src_cache, True)
+        dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()
+    @classmethod
+    def use_sync_weight_loader(cls) -> bool:
+        return True
+try:
+    from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform
+    TpuPlatform = TpuCommonsPlatform  # type: ignore
+    USE_TPU_COMMONS = True
+except ImportError:
+    logger.info("tpu_commons not found, using vLLM's TpuPlatform")
+    pass

vllm/platforms/xpu.py ADDED Viewed

@@ -0,0 +1,243 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from typing import TYPE_CHECKING, Optional
+import torch
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
+from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, VllmConfig
+else:
+    ModelConfig = None
+    VllmConfig = None
+logger = init_logger(__name__)
+class XPUPlatform(Platform):
+    _enum = PlatformEnum.XPU
+    device_name: str = "xpu"
+    device_type: str = "xpu"
+    dispatch_key: str = "XPU"
+    # Intel XPU's device key is "GPU" for Ray.
+    # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
+    ray_device_key: str = "GPU"
+    dist_backend: str = "ccl"  # ccl | xccl
+    device_control_env_var: str = "ZE_AFFINITY_MASK"
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink: bool, use_sparse) -> str:
+        if use_sparse:
+            raise NotImplementedError(
+                "Sparse Attention is not supported on XPU.")
+        use_v1 = envs.VLLM_USE_V1
+        if not use_v1:
+            raise ValueError("XPU backend only supports V1.")
+        TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"  # noqa: E501
+        FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
+        if selected_backend == _Backend.TRITON_ATTN:
+            logger.info_once("Using Triton backend on V1 engine.")
+            return TRITON_ATTN
+        elif selected_backend == _Backend.FLASH_ATTN:
+            logger.info_once("Using Flash Attention backend on V1 engine.")
+            return FLASH_ATTN
+        elif selected_backend:
+            raise ValueError(
+                f"Invalid attention backend for {cls.device_name}, "
+                f"with use_v1: {use_v1} use_mla: {use_mla}")
+        logger.info("Using Flash Attention backend on V1 engine.")
+        return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+    @classmethod
+    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
+                                    model_config: "ModelConfig") -> bool:
+        """
+        Check if the kv_cache_dtype is supported.
+        XPU only support fp8 kv cache with triton backend.
+        """
+        if envs.is_set("VLLM_ATTENTION_BACKEND") and \
+            envs.VLLM_ATTENTION_BACKEND == "TRITON_ATTN":
+            return kv_cache_dtype in ["fp8_e4m3", "fp8_e5m2", "fp8"]
+        return False
+    @classmethod
+    def set_device(cls, device: torch.device) -> None:
+        """
+        Set the device for the current platform.
+        """
+        torch.xpu.set_device(device)
+    @classmethod
+    def get_device_capability(
+        cls,
+        device_id: int = 0,
+    ) -> Optional[DeviceCapability]:
+        # capacity format differs from cuda's and will cause unexpected
+        # failure, so use None directly
+        return None
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.xpu.get_device_name(device_id)
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPU"
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        device_props = torch.xpu.get_device_properties(device_id)
+        return device_props.total_memory
+    @classmethod
+    def inference_mode(cls):
+        return torch.no_grad()
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        model_config = vllm_config.model_config
+        # in V1(or with ipex chunked prefill) block_size is 64
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 64
+        # lazy import to avoid circular import
+        from vllm.config import CompilationLevel, CUDAGraphMode
+        compilation_config = vllm_config.compilation_config
+        if compilation_config.compile_sizes is None:
+            compilation_config.compile_sizes = []
+        assert compilation_config.cudagraph_mode == CUDAGraphMode.NONE, \
+            "CUDA graph mode should be NONE on XPU"
+        if vllm_config.lora_config is not None:
+            compilation_config.level = CompilationLevel.NO_COMPILATION
+        # check and update parallel config
+        parallel_config = vllm_config.parallel_config
+        parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"
+        if parallel_config.distributed_executor_backend is None:
+            if parallel_config.world_size > 1:
+                parallel_config.distributed_executor_backend = "ray"
+            else:
+                parallel_config.distributed_executor_backend = "uni"
+        elif parallel_config.distributed_executor_backend == "mp":
+            # FIXME(kunshang):
+            # spawn needs calling `if __name__ == '__main__':``
+            # fork is not supported for xpu start new process.
+            if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
+                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+                logger.warning(
+                    "Please use spawn as start method if you want to use mp.")
+        elif (parallel_config.distributed_executor_backend != "ray"
+              and parallel_config.distributed_executor_backend != "uni"
+              and parallel_config.distributed_executor_backend
+              != "external_launcher"):
+            logger.warning(
+                "%s is not supported on XPU, fallback to ray distributed"
+                " executor backend.",
+                parallel_config.distributed_executor_backend)
+            parallel_config.distributed_executor_backend = "ray"
+        if model_config and model_config.use_mla:
+            logger.info(
+                "MLA is enabled on a non-GPU platform; forcing chunked "
+                "prefill and prefix caching to be disabled.")
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.max_num_batched_tokens = max(
+                vllm_config.scheduler_config.max_model_len,
+                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+        from vllm.v1.attention.backends.utils import set_kv_cache_layout
+        set_kv_cache_layout("NHD")
+        logger.info("Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
+                    "only NHD layout is supported by XPU attention kernels.")
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
+    @classmethod
+    def support_static_graph_mode(cls) -> bool:
+        return False
+    @classmethod
+    def is_pin_memory_available(cls):
+        return True
+    @classmethod
+    def get_current_memory_usage(cls,
+                                 device: Optional[torch.types.Device] = None
+                                 ) -> float:
+        torch.xpu.reset_peak_memory_stats(device)
+        return torch.xpu.max_memory_allocated(device)
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        return torch.float8_e5m2
+    @classmethod
+    def is_data_center_gpu(cls) -> bool:
+        device_name = cls.get_device_name().lower()
+        return device_name.count("data center gpu") > 0
+    @classmethod
+    def get_device_communicator_cls(cls) -> str:
+        return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator"  # noqa
+    @classmethod
+    def device_count(cls) -> int:
+        return torch.xpu.device_count()
+    @classmethod
+    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
+        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+            device_name = cls.get_device_name().lower()
+            # client gpu a770
+            if device_name.count("a770") > 0:
+                raise ValueError(
+                    "Intel Arc A770 have bfloat16 accuracy known issue. "
+                    "You can use float16 instead by explicitly setting the "
+                    "`dtype` flag in CLI, for example: --dtype=half.")
+    @classmethod
+    def opaque_attention_op(cls) -> bool:
+        return True
+    @classmethod
+    def insert_blocks_to_device(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from src_cache to dst_cache on XPU."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)
+    @classmethod
+    def swap_out_blocks_to_host(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from XPU to host (CPU)."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.cpu()

vllm/plugins/__init__.py ADDED Viewed

@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+from typing import Any, Callable
+import vllm.envs as envs
+logger = logging.getLogger(__name__)
+DEFAULT_PLUGINS_GROUP = 'vllm.general_plugins'
+# make sure one process only loads plugins once
+plugins_loaded = False
+def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
+    import sys
+    if sys.version_info < (3, 10):
+        from importlib_metadata import entry_points
+    else:
+        from importlib.metadata import entry_points
+    allowed_plugins = envs.VLLM_PLUGINS
+    discovered_plugins = entry_points(group=group)
+    if len(discovered_plugins) == 0:
+        logger.debug("No plugins for group %s found.", group)
+        return {}
+    # Check if the only discovered plugin is the default one
+    is_default_group = (group == DEFAULT_PLUGINS_GROUP)
+    # Use INFO for non-default groups and DEBUG for the default group
+    log_level = logger.debug if is_default_group else logger.info
+    log_level("Available plugins for group %s:", group)
+    for plugin in discovered_plugins:
+        log_level("- %s -> %s", plugin.name, plugin.value)
+    if allowed_plugins is None:
+        log_level("All plugins in this group will be loaded. "
+                  "Set `VLLM_PLUGINS` to control which plugins to load.")
+    plugins = dict[str, Callable[[], Any]]()
+    for plugin in discovered_plugins:
+        if allowed_plugins is None or plugin.name in allowed_plugins:
+            if allowed_plugins is not None:
+                log_level("Loading plugin %s", plugin.name)
+            try:
+                func = plugin.load()
+                plugins[plugin.name] = func
+            except Exception:
+                logger.exception("Failed to load plugin %s", plugin.name)
+    return plugins
+def load_general_plugins():
+    """WARNING: plugins can be loaded for multiple times in different
+    processes. They should be designed in a way that they can be loaded
+    multiple times without causing issues.
+    """
+    global plugins_loaded
+    if plugins_loaded:
+        return
+    plugins_loaded = True
+    plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP)
+    # general plugins, we only need to execute the loaded functions
+    for func in plugins.values():
+        func()

vllm/plugins/io_processors/__init__.py ADDED Viewed

@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+import logging
+from typing import Optional
+from vllm.config import VllmConfig
+from vllm.plugins import load_plugins_by_group
+from vllm.plugins.io_processors.interface import IOProcessor
+from vllm.utils import resolve_obj_by_qualname
+logger = logging.getLogger(__name__)
+def get_io_processor(
+        vllm_config: VllmConfig,
+        plugin_from_init: Optional[str] = None) -> IOProcessor | None:
+    # Input.Output processors are loaded as plugins under the
+    # 'vllm.io_processor_plugins' group. Similar to platform
+    # plugins, these plugins register a function that returns the class
+    # name for the processor to install.
+    if plugin_from_init:
+        model_plugin = plugin_from_init
+    else:
+        # A plugin can be specified via the model config
+        # Retrieve the model specific plugin if available
+        # This is using a custom field in the hf_config for the model
+        hf_config = vllm_config.model_config.hf_config.to_dict()
+        config_plugin = hf_config.get("io_processor_plugin")
+        model_plugin = config_plugin
+    if model_plugin is None:
+        logger.debug("No IOProcessor plugins requested by the model")
+        return None
+    logger.debug("IOProcessor plugin to be loaded %s", model_plugin)
+    # Load all installed plugin in the group
+    multimodal_data_processor_plugins = \
+        load_plugins_by_group('vllm.io_processor_plugins')
+    loadable_plugins = {}
+    for name, func in multimodal_data_processor_plugins.items():
+        try:
+            assert callable(func)
+            processor_cls_qualname = func()
+            if processor_cls_qualname is not None:
+                loadable_plugins[name] = processor_cls_qualname
+        except Exception:
+            logger.warning("Failed to load plugin %s.", name, exc_info=True)
+    num_available_plugins = len(loadable_plugins.keys())
+    if num_available_plugins == 0:
+        raise ValueError("No IOProcessor plugins installed"
+                         f" but one is required ({model_plugin}).")
+    if model_plugin not in loadable_plugins:
+        raise ValueError(
+            f"The model requires the '{model_plugin}' IO Processor plugin "
+            "but it is not installed. "
+            f"Available plugins: {list(loadable_plugins.keys())}")
+    activated_plugin_cls = loadable_plugins[model_plugin]
+    return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config)

vllm/plugins/io_processors/interface.py ADDED Viewed

@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator, Sequence
+from typing import Any, Generic, Optional, TypeVar, Union
+from vllm.config import VllmConfig
+from vllm.entrypoints.openai.protocol import IOProcessorResponse
+from vllm.inputs.data import PromptType
+from vllm.outputs import PoolingRequestOutput
+IOProcessorInput = TypeVar('IOProcessorInput')
+IOProcessorOutput = TypeVar('IOProcessorOutput')
+class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
+    def __init__(self, vllm_config: VllmConfig):
+        self.vllm_config = vllm_config
+    @abstractmethod
+    def pre_process(
+        self,
+        prompt: IOProcessorInput,
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> Union[PromptType, Sequence[PromptType]]:
+        raise NotImplementedError
+    async def pre_process_async(
+        self,
+        prompt: IOProcessorInput,
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> Union[PromptType, Sequence[PromptType]]:
+        return self.pre_process(prompt, request_id, **kwargs)
+    @abstractmethod
+    def post_process(self,
+                     model_output: Sequence[PoolingRequestOutput],
+                     request_id: Optional[str] = None,
+                     **kwargs) -> IOProcessorOutput:
+        raise NotImplementedError
+    async def post_process_async(
+        self,
+        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
+        request_id: Optional[str] = None,
+        **kwargs,
+    ) -> IOProcessorOutput:
+        # We cannot guarantee outputs are returned in the same order they were
+        # fed to vLLM.
+        # Let's sort them by id before post_processing
+        sorted_output = sorted([(i, item) async for i, item in model_output],
+                               key=lambda output: output[0])
+        collected_output = [output[1] for output in sorted_output]
+        return self.post_process(collected_output, request_id, **kwargs)
+    @abstractmethod
+    def parse_request(self, request: Any) -> IOProcessorInput:
+        raise NotImplementedError
+    @abstractmethod
+    def output_to_response(
+            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        raise NotImplementedError