PyPI - ipex-llm - Versions diffs - 2.2.0b20250120__py3-none-win_amd64.whl → 2.2.0b20250122__py3-none-win_amd64.whl - Mend

ipex-llm 2.2.0b20250120__py3-none-win_amd64.whl → 2.2.0b20250122__py3-none-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (50)

ipex_llm/libs/bloom-api.dll +0 -0
ipex_llm/libs/bloom.dll +0 -0
ipex_llm/libs/gptneox-api.dll +0 -0
ipex_llm/libs/gptneox.dll +0 -0
ipex_llm/libs/libbloom_avx.dll +0 -0
ipex_llm/libs/libbloom_vnni.dll +0 -0
ipex_llm/libs/libgptneox_avx.dll +0 -0
ipex_llm/libs/libgptneox_vnni.dll +0 -0
ipex_llm/libs/libllama_avx.dll +0 -0
ipex_llm/libs/libllama_vnni.dll +0 -0
ipex_llm/libs/libstarcoder_avx.dll +0 -0
ipex_llm/libs/libstarcoder_vnni.dll +0 -0
ipex_llm/libs/llama-api.dll +0 -0
ipex_llm/libs/llama.dll +0 -0
ipex_llm/libs/main-bloom.exe +0 -0
ipex_llm/libs/main-gptneox.exe +0 -0
ipex_llm/libs/main-llama.exe +0 -0
ipex_llm/libs/main-starcoder.exe +0 -0
ipex_llm/libs/pipeline.dll +0 -0
ipex_llm/libs/quantize-bloom.exe +0 -0
ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
ipex_llm/libs/quantize-gptneox.exe +0 -0
ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
ipex_llm/libs/quantize-llama.exe +0 -0
ipex_llm/libs/quantize-llama_vnni.exe +0 -0
ipex_llm/libs/quantize-starcoder.exe +0 -0
ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
ipex_llm/libs/starcoder-api.dll +0 -0
ipex_llm/libs/starcoder.dll +0 -0
ipex_llm/transformers/convert.py +0 -1
ipex_llm/transformers/low_bit_linear.py +8 -5
ipex_llm/transformers/model.py +1 -3
ipex_llm/transformers/patches.py +0 -11
ipex_llm/transformers/utils.py +16 -10
ipex_llm/vllm/cpu/engine/__init__.py +2 -1
ipex_llm/vllm/cpu/engine/engine.py +159 -75
ipex_llm/vllm/cpu/entrypoints/api_server.py +787 -0
ipex_llm/vllm/cpu/entrypoints/openai/api_server.py +680 -95
ipex_llm/vllm/cpu/entrypoints/openai/cli_args.py +277 -0
ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py +23 -0
ipex_llm/vllm/cpu/ipex_llm_wrapper.py +24 -0
ipex_llm/vllm/cpu/model_convert.py +126 -233
{ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/METADATA +20 -20
{ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/RECORD +50 -46
{ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/ipex-llm-init.bat +0 -0
{ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/llm-chat.ps1 +0 -0
{ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/llm-cli.ps1 +0 -0
{ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/WHEEL +0 -0
{ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/entry_points.txt +0 -0
{ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/top_level.txt +0 -0

ipex_llm/libs/bloom-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/bloom.dll CHANGED Viewed

Binary file

ipex_llm/libs/gptneox-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/gptneox.dll CHANGED Viewed

Binary file

ipex_llm/libs/libbloom_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libbloom_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libgptneox_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libgptneox_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libllama_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libllama_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/libstarcoder_avx.dll CHANGED Viewed

Binary file

ipex_llm/libs/libstarcoder_vnni.dll CHANGED Viewed

Binary file

ipex_llm/libs/llama-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/llama.dll CHANGED Viewed

Binary file

ipex_llm/libs/main-bloom.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-gptneox.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-llama.exe CHANGED Viewed

Binary file

ipex_llm/libs/main-starcoder.exe CHANGED Viewed

Binary file

ipex_llm/libs/pipeline.dll CHANGED Viewed

Binary file

ipex_llm/libs/quantize-bloom.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-bloom_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-gptneox.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-gptneox_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-llama.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-llama_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-starcoder.exe CHANGED Viewed

Binary file

ipex_llm/libs/quantize-starcoder_vnni.exe CHANGED Viewed

Binary file

ipex_llm/libs/starcoder-api.dll CHANGED Viewed

Binary file

ipex_llm/libs/starcoder.dll CHANGED Viewed

Binary file

ipex_llm/transformers/convert.py CHANGED Viewed

@@ -693,7 +693,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                             out_features,
                             mp_group,
                             None,
-                            None,
                             optimize_lm_head,
                             None
                         )

ipex_llm/transformers/low_bit_linear.py CHANGED Viewed

@@ -204,12 +204,15 @@ def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int
 def ggml_q_format_convet_xpu2cpu(tensor: torch.Tensor, num_elem: int, qtype: int):
-    invalidInputError(tensor.dtype == torch.uint8,
-                      "Input tensor must be uint8")
+    if qtype == NF4:
+        invalidInputError(tensor.dtype == torch.bfloat16,
+                          "NF4 Input tensor must be bfloat16")
+    else:
+        invalidInputError(tensor.dtype == torch.uint8,
+                          "Input tensor must be uint8")
     invalidInputError(tensor.device == torch.device('cpu'),
-                      "Input tensor must be uint8")
+                      "Input tensor must be on cpu")
     src = ctypes.c_void_p(tensor.data.data_ptr())
@@ -746,7 +749,7 @@ class LowBitLinear(nn.Linear):
                 dist.inference_all_reduce(result, group=self.mp_group)
             if self.bias is not None:
                 result += self.bias
-        return result
+        return result.to(x.dtype)
 class FP16Linear(nn.Linear):

ipex_llm/transformers/model.py CHANGED Viewed

@@ -51,7 +51,7 @@ from ipex_llm.transformers.gguf.api import load_gguf_model
 from .utils import logger, load_state_dict
 from .utils import extract_local_archive_file, get_local_shard_files, load_imatrix_data
-from .patches import patch_flash_attn_import, patch_sdpa_available
+from .patches import patch_flash_attn_import
 patched_training_mode = None
@@ -108,7 +108,6 @@ class _BaseAutoModelClass:
     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
-    @patch("transformers.modeling_utils.is_torch_sdpa_available", patch_sdpa_available, create=True)
     def from_pretrained(cls,
                         *args,
                         **kwargs):
@@ -531,7 +530,6 @@ class _BaseAutoModelClass:
     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
-    @patch("transformers.modeling_utils.is_torch_sdpa_available", patch_sdpa_available, create=True)
     def load_low_bit(cls,
                      pretrained_model_name_or_path,
                      *model_args,

ipex_llm/transformers/patches.py CHANGED Viewed

@@ -26,14 +26,3 @@ def patch_flash_attn_import(filename: str) -> List[str]:
     if "flash_attn" in imports:
         imports.remove("flash_attn")
     return imports
-def patch_sdpa_available() -> bool:
-    if IPEXImporter.is_xpu_version_installed():
-        return False
-    else:
-        try:
-            from transformers.utils import is_torch_sdpa_available
-            return is_torch_sdpa_available()
-        except ImportError:
-            return False

ipex_llm/transformers/utils.py CHANGED Viewed

@@ -139,19 +139,25 @@ def fix_key(key):
 def get_autocast_dtype(x):
-    if x.device.type == "xpu":
-        if torch.xpu.is_autocast_xpu_enabled():
-            return torch.xpu.get_autocast_xpu_dtype()
-        else:
-            return None
-    elif x.device.type == "cpu":
-        if torch.is_autocast_cpu_enabled():
-            return torch.get_autocast_cpu_dtype()
+    if torch.__version__ >= '2.3':
+        if torch.is_autocast_enabled(x.device.type):
+            return torch.get_autocast_dtype(x.device.type)
         else:
             return None
     else:
-        invalidInputError(False,
-                          f"Device {x.device} is not supported.")
+        if x.device.type == "xpu":
+            if torch.xpu.is_autocast_xpu_enabled():
+                return torch.xpu.get_autocast_xpu_dtype()
+            else:
+                return None
+        elif x.device.type == "cpu":
+            if torch.is_autocast_cpu_enabled():
+                return torch.get_autocast_cpu_dtype()
+            else:
+                return None
+        else:
+            invalidInputError(False,
+                              f"Device {x.device} is not supported.")
 def get_xpu_device_name(device: torch.device):

ipex_llm/vllm/cpu/engine/__init__.py CHANGED Viewed

@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass
+from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine
 __all__ = [
     "IPEXLLMAsyncLLMEngine",
     "IPEXLLMLLMEngine",
     "IPEXLLMClass",
+    "run_mp_engine",
 ]

ipex_llm/vllm/cpu/engine/engine.py CHANGED Viewed

@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from typing import List, Optional, Union
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
-from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
-                                  usage_message)
 from vllm.utils import Counter
+from vllm.config import VllmConfig
+from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+from vllm.usage.usage_lib import UsageContext
+from vllm.engine.metrics import StatLoggerBase
+from vllm.engine.multiprocessing.engine import MQLLMEngine
+import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os
-from ipex_llm.utils.common import invalidInputError
+logger = init_logger(__name__)
 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,49 +45,43 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        load_in_low_bit: Optional[str] = None,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
     ) -> "AsyncLLMEngine":
         """Creates an async LLM engine from the engine arguments."""
-        # Enable ipex-llm optimizations
-        engine_config = engine_args.create_engine_config()
-        from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+        # Create the engine configs.
         _ipex_llm_convert(load_in_low_bit)
-        if engine_config.device_config.device_type == "neuron":
-            from vllm.executor.neuron_executor import NeuronExecutorAsync
-            executor_class = NeuronExecutorAsync
-        elif engine_config.device_config.device_type == "cpu":
-            invalidInputError(not engine_config.parallel_config.worker_use_ray, (
-                "Ray is not supported with the CPU backend."))
-            from vllm.executor.cpu_executor import CPUExecutorAsync
-            executor_class = CPUExecutorAsync
-        elif engine_config.parallel_config.worker_use_ray:
-            initialize_ray_cluster(engine_config.parallel_config)
-            from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
-            executor_class = RayGPUExecutorAsync
-        else:
-            invalidInputError(engine_config.parallel_config.world_size == 1, (
-                "Ray is required if parallel_config.world_size > 1."))
-            from vllm.executor.gpu_executor import GPUExecutorAsync
-            executor_class = GPUExecutorAsync
-        # Create the async LLM engine.
-        engine = cls(
-            engine_config.parallel_config.worker_use_ray,
-            engine_args.engine_use_ray,
-            **engine_config.to_dict(),
-            executor_class=executor_class,
-            log_requests=not engine_args.disable_log_requests,
-            log_stats=not engine_args.disable_log_stats,
-            max_log_len=engine_args.max_log_len,
-            start_engine_loop=start_engine_loop,
-            usage_context=usage_context,
-        )
-        return engine
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
-class IPEXLLMClass(LLM):
+class IPEXLLMAsyncV1Engine(AsyncLLM):
+    def __init__(self, *args, **kwargs):
+        print("IPEX-LLM V1 engine get started...")
+        super().__init__(*args, **kwargs)
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+class IPEXLLMClass(LLM):
     def __init__(
         self,
         model: str,
@@ -85,6 +89,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -92,22 +97,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space: int = 4,
-        enforce_eager: bool = False,
-        max_context_len_to_capture: Optional[int] = None,
+        swap_space: float = 4,
+        cpu_offload_gb: float = 0,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
-        load_in_low_bit: Optional[str] = None,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
+        load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
@@ -116,16 +147,60 @@ class IPEXLLMClass(LLM):
             seed=seed,
             gpu_memory_utilization=gpu_memory_utilization,
             swap_space=swap_space,
+            cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-        self.llm_engine = IPEXLLMLLMEngine.from_engine_args(engine_args,
-                                                            load_in_low_bit=load_in_low_bit)
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        # TODO(gc): we will need to override this function
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
+            engine_args, usage_context=UsageContext.LLM_CLASS,
+            load_in_low_bit=load_in_low_bit)
         self.request_counter = Counter()
+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            # Lazy import: the v1 package isn't distributed
+            # from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+            return IPEXLLMLLMV1Engine  # type: ignore
+        return IPEXLLMLLMEngine
+# TODO(gc): implement this later...
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+        # TODO(gc): delete this later
+        print("IPEXLLM V1 Engine")
+        # This does not work as it is in the seperate process...
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args, usage_context,
+                                        stat_loggers, enable_multiprocessing)
 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
@@ -136,35 +211,44 @@ class IPEXLLMLLMEngine(LLMEngine):
         cls,
         engine_args: EngineArgs,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        load_in_low_bit: Optional[str] = None,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        load_in_low_bit: str = "sym_int4",
     ) -> "LLMEngine":
         """Creates an LLM engine from the engine arguments."""
         # Create the engine configs.
-        engine_config = engine_args.create_engine_config()
-        from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+        # TODO(gc): Delete
+        print("Use vLLM v0 engine")
         _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args, usage_context, stat_loggers)
-        # Initialize the cluster and specify the executor class.
-        if engine_config.device_config.device_type == "neuron":
-            from vllm.executor.neuron_executor import NeuronExecutor
-            executor_class = NeuronExecutor
-        elif engine_config.device_config.device_type == "cpu":
-            from vllm.executor.cpu_executor import CPUExecutor
-            executor_class = CPUExecutor
-        elif engine_config.parallel_config.worker_use_ray:
-            initialize_ray_cluster(engine_config.parallel_config)
-            from vllm.executor.ray_gpu_executor import RayGPUExecutor
-            executor_class = RayGPUExecutor
-        else:
-            invalidInputError(engine_config.parallel_config.world_size == 1, (
-                "Ray is required if parallel_config.world_size > 1."))
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-        # Create the LLM engine.
-        engine = cls(**engine_config.to_dict(),
-                     executor_class=executor_class,
-                     log_stats=not engine_args.disable_log_stats,
-                     usage_context=usage_context,
-                     )
-        return engine
+class IPEXLLMMQLLMEngine(MQLLMEngine):
+    @classmethod
+    def from_engine_args(cls, engine_args: AsyncEngineArgs,
+                         usage_context: UsageContext, ipc_path: str, load_in_low_bit: str):
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args, usage_context, ipc_path)
+def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
+                  ipc_path: str, load_in_low_bit: str, engine_alive):
+    def signal_handler(*_) -> None:
+        # Interrupt server on sigterm
+        raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine