ipex-llm 2.2.0b20250120__py3-none-win_amd64.whl → 2.2.0b20250122__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/convert.py +0 -1
  31. ipex_llm/transformers/low_bit_linear.py +8 -5
  32. ipex_llm/transformers/model.py +1 -3
  33. ipex_llm/transformers/patches.py +0 -11
  34. ipex_llm/transformers/utils.py +16 -10
  35. ipex_llm/vllm/cpu/engine/__init__.py +2 -1
  36. ipex_llm/vllm/cpu/engine/engine.py +159 -75
  37. ipex_llm/vllm/cpu/entrypoints/api_server.py +787 -0
  38. ipex_llm/vllm/cpu/entrypoints/openai/api_server.py +680 -95
  39. ipex_llm/vllm/cpu/entrypoints/openai/cli_args.py +277 -0
  40. ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py +23 -0
  41. ipex_llm/vllm/cpu/ipex_llm_wrapper.py +24 -0
  42. ipex_llm/vllm/cpu/model_convert.py +126 -233
  43. {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/METADATA +20 -20
  44. {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/RECORD +50 -46
  45. {ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/ipex-llm-init.bat +0 -0
  46. {ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/llm-chat.ps1 +0 -0
  47. {ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/llm-cli.ps1 +0 -0
  48. {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/WHEEL +0 -0
  49. {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/entry_points.txt +0 -0
  50. {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/top_level.txt +0 -0
Binary files under ipex_llm/libs/ (the DLLs and EXEs listed above, including bloom.dll, gptneox.dll and llama.dll) CHANGED — no textual diff is shown for binary content.
ipex_llm/transformers/convert.py CHANGED
@@ -693,7 +693,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                      out_features,
                      mp_group,
                      None,
-                     None,
                      optimize_lm_head,
                      None
                  )
ipex_llm/transformers/low_bit_linear.py CHANGED
@@ -204,12 +204,15 @@ def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int


 def ggml_q_format_convet_xpu2cpu(tensor: torch.Tensor, num_elem: int, qtype: int):
-
-    invalidInputError(tensor.dtype == torch.uint8,
-                      "Input tensor must be uint8")
+    if qtype == NF4:
+        invalidInputError(tensor.dtype == torch.bfloat16,
+                          "NF4 Input tensor must be bfloat16")
+    else:
+        invalidInputError(tensor.dtype == torch.uint8,
+                          "Input tensor must be uint8")

     invalidInputError(tensor.device == torch.device('cpu'),
-                      "Input tensor must be uint8")
+                      "Input tensor must be on cpu")

     src = ctypes.c_void_p(tensor.data.data_ptr())

@@ -746,7 +749,7 @@ class LowBitLinear(nn.Linear):
             dist.inference_all_reduce(result, group=self.mp_group)
         if self.bias is not None:
             result += self.bias
-        return result
+        return result.to(x.dtype)


 class FP16Linear(nn.Linear):
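The low_bit_linear change does two things: the dtype check now accepts bfloat16 tensors for NF4, and LowBitLinear.forward casts its result back to the caller's dtype. A toy sketch in plain PyTorch (not ipex-llm internals) of why the final .to(x.dtype) matters when the matmul and bias add accumulate in a wider dtype:

import torch

x = torch.randn(2, 8, dtype=torch.bfloat16)

# Hypothetical low-bit path: compute and bias-add happen in fp32...
result = x.float() @ torch.randn(8, 4)
result += torch.zeros(4)
print(result.dtype)               # torch.float32

# ...so the forward pass must cast back before returning:
print(result.to(x.dtype).dtype)   # torch.bfloat16, matches the caller's input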
ipex_llm/transformers/model.py CHANGED
@@ -51,7 +51,7 @@ from ipex_llm.transformers.gguf.api import load_gguf_model

 from .utils import logger, load_state_dict
 from .utils import extract_local_archive_file, get_local_shard_files, load_imatrix_data
-from .patches import patch_flash_attn_import, patch_sdpa_available
+from .patches import patch_flash_attn_import

 patched_training_mode = None

@@ -108,7 +108,6 @@ class _BaseAutoModelClass:

     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
-    @patch("transformers.modeling_utils.is_torch_sdpa_available", patch_sdpa_available, create=True)
     def from_pretrained(cls,
                         *args,
                         **kwargs):

@@ -531,7 +530,6 @@ class _BaseAutoModelClass:

     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
-    @patch("transformers.modeling_utils.is_torch_sdpa_available", patch_sdpa_available, create=True)
     def load_low_bit(cls,
                      pretrained_model_name_or_path,
                      *model_args,
ipex_llm/transformers/patches.py CHANGED
@@ -26,14 +26,3 @@ def patch_flash_attn_import(filename: str) -> List[str]:
     if "flash_attn" in imports:
         imports.remove("flash_attn")
     return imports
-
-
-def patch_sdpa_available() -> bool:
-    if IPEXImporter.is_xpu_version_installed():
-        return False
-    else:
-        try:
-            from transformers.utils import is_torch_sdpa_available
-            return is_torch_sdpa_available()
-        except ImportError:
-            return False
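With patch_sdpa_available gone, only patch_flash_attn_import remains on from_pretrained and load_low_bit. A self-contained toy (standard-library unittest.mock; the get_imports below is a stand-in for transformers.dynamic_module_utils.get_imports, not the real function) showing the patching pattern those decorators rely on:

from typing import List
from unittest.mock import patch

def get_imports(filename: str) -> List[str]:
    # Stand-in: the real one parses a model file's import statements
    return ["torch", "flash_attn"]

_original_get_imports = get_imports

def patch_flash_attn_import(filename: str) -> List[str]:
    # Mirrors the ipex-llm patch: drop the hard flash_attn requirement
    imports = _original_get_imports(filename)
    if "flash_attn" in imports:
        imports.remove("flash_attn")
    return imports

@patch(f"{__name__}.get_imports", patch_flash_attn_import)
def load_model() -> List[str]:
    # While this call runs, lookups of get_imports see the patched version
    return get_imports("modeling_foo.py")

print(load_model())                     # ['torch'] -- flash_attn filtered out
print(get_imports("modeling_foo.py"))   # ['torch', 'flash_attn'] -- restored after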
ipex_llm/transformers/utils.py CHANGED
@@ -139,19 +139,25 @@ def fix_key(key):


 def get_autocast_dtype(x):
-    if x.device.type == "xpu":
-        if torch.xpu.is_autocast_xpu_enabled():
-            return torch.xpu.get_autocast_xpu_dtype()
-        else:
-            return None
-    elif x.device.type == "cpu":
-        if torch.is_autocast_cpu_enabled():
-            return torch.get_autocast_cpu_dtype()
+    if torch.__version__ >= '2.3':
+        if torch.is_autocast_enabled(x.device.type):
+            return torch.get_autocast_dtype(x.device.type)
         else:
             return None
     else:
-        invalidInputError(False,
-                          f"Device {x.device} is not supported.")
+        if x.device.type == "xpu":
+            if torch.xpu.is_autocast_xpu_enabled():
+                return torch.xpu.get_autocast_xpu_dtype()
+            else:
+                return None
+        elif x.device.type == "cpu":
+            if torch.is_autocast_cpu_enabled():
+                return torch.get_autocast_cpu_dtype()
+            else:
+                return None
+        else:
+            invalidInputError(False,
+                              f"Device {x.device} is not supported.")


 def get_xpu_device_name(device: torch.device):
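For reference, a minimal sketch of the device-generic AMP calls the new branch targets. These generic forms exist in recent PyTorch releases; exact availability depends on the installed version, which is what the torch.__version__ guard above is for:

import torch

x = torch.randn(4, 4)

with torch.amp.autocast(device_type="cpu", dtype=torch.bfloat16):
    # Device-generic queries used by the new branch:
    if torch.is_autocast_enabled("cpu"):
        print(torch.get_autocast_dtype("cpu"))   # torch.bfloat16

# Older per-device spelling, kept in the fallback branch:
print(torch.is_autocast_cpu_enabled())           # False outside the context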
ipex_llm/vllm/cpu/engine/__init__.py CHANGED
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass
+from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine
 __all__ = [
     "IPEXLLMAsyncLLMEngine",
     "IPEXLLMLLMEngine",
     "IPEXLLMClass",
+    "run_mp_engine",
 ]
ipex_llm/vllm/cpu/engine/engine.py CHANGED
@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-from typing import List, Optional, Union
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
-from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
-                                  usage_message)
 from vllm.utils import Counter
+from vllm.config import VllmConfig
+from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+from vllm.usage.usage_lib import UsageContext
+from vllm.engine.metrics import StatLoggerBase
+from vllm.engine.multiprocessing.engine import MQLLMEngine
+import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os

-from ipex_llm.utils.common import invalidInputError
+logger = init_logger(__name__)


 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,49 +45,43 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        load_in_low_bit: Optional[str] = None,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
     ) -> "AsyncLLMEngine":
         """Creates an async LLM engine from the engine arguments."""
-        # Enable ipex-llm optimizations
-        engine_config = engine_args.create_engine_config()
-        from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+        # Create the engine configs.
         _ipex_llm_convert(load_in_low_bit)
-        if engine_config.device_config.device_type == "neuron":
-            from vllm.executor.neuron_executor import NeuronExecutorAsync
-            executor_class = NeuronExecutorAsync
-        elif engine_config.device_config.device_type == "cpu":
-            invalidInputError(not engine_config.parallel_config.worker_use_ray, (
-                "Ray is not supported with the CPU backend."))
-            from vllm.executor.cpu_executor import CPUExecutorAsync
-            executor_class = CPUExecutorAsync
-        elif engine_config.parallel_config.worker_use_ray:
-            initialize_ray_cluster(engine_config.parallel_config)
-            from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
-            executor_class = RayGPUExecutorAsync
-        else:
-            invalidInputError(engine_config.parallel_config.world_size == 1, (
-                "Ray is required if parallel_config.world_size > 1."))
-            from vllm.executor.gpu_executor import GPUExecutorAsync
-            executor_class = GPUExecutorAsync
-        # Create the async LLM engine.
-        engine = cls(
-            engine_config.parallel_config.worker_use_ray,
-            engine_args.engine_use_ray,
-            **engine_config.to_dict(),
-            executor_class=executor_class,
-            log_requests=not engine_args.disable_log_requests,
-            log_stats=not engine_args.disable_log_stats,
-            max_log_len=engine_args.max_log_len,
-            start_engine_loop=start_engine_loop,
-            usage_context=usage_context,
-        )
-        return engine
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)


-class IPEXLLMClass(LLM):
+class IPEXLLMAsyncV1Engine(AsyncLLM):

+    def __init__(self, *args, **kwargs):
+        print("IPEX-LLM V1 engine get started...")
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+
+
+class IPEXLLMClass(LLM):
     def __init__(
         self,
         model: str,
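The rewritten hook is deliberately small: convert the model format first, then delegate to upstream vLLM via super().from_engine_args instead of hand-picking executor classes. A hedged usage sketch (the model id is a placeholder; assumes a vLLM build whose AsyncLLMEngine.from_engine_args accepts these keyword arguments, as this diff does):

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.usage.usage_lib import UsageContext
from ipex_llm.vllm.cpu.engine import IPEXLLMAsyncLLMEngine

engine = IPEXLLMAsyncLLMEngine.from_engine_args(
    engine_args=AsyncEngineArgs(model="facebook/opt-125m"),  # placeholder model
    usage_context=UsageContext.OPENAI_API_SERVER,
    load_in_low_bit="sym_int4",  # applied via _ipex_llm_convert before delegation
)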
@@ -85,6 +89,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -92,22 +97,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space: int = 4,
-        enforce_eager: bool = False,
-        max_context_len_to_capture: Optional[int] = None,
+        swap_space: float = 4,
+        cpu_offload_gb: float = 0,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
-        load_in_low_bit: Optional[str] = None,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]] = None,
+        load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
+
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
+
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
+
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
@@ -116,16 +147,60 @@ class IPEXLLMClass(LLM):
             seed=seed,
             gpu_memory_utilization=gpu_memory_utilization,
             swap_space=swap_space,
+            cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-        self.llm_engine = IPEXLLMLLMEngine.from_engine_args(engine_args,
-                                                            load_in_low_bit=load_in_low_bit)
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        # TODO(gc): we will need to override this function
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
+            engine_args, usage_context=UsageContext.LLM_CLASS,
+            load_in_low_bit=load_in_low_bit)
+
         self.request_counter = Counter()

+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            # Lazy import: the v1 package isn't distributed
+            # from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+            return IPEXLLMLLMV1Engine  # type: ignore
+        return IPEXLLMLLMEngine
+
+
+# TODO(gc): implement this later...
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+
+        # TODO(gc): delete this later
+        print("IPEXLLM V1 Engine")
+        # This does not work as it is in the separate process...
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args, usage_context,
+                                        stat_loggers, enable_multiprocessing)
+

 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
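Taken together, IPEXLLMClass now mirrors the upstream LLM constructor and picks its engine class at runtime (v0 or v1 via envs.VLLM_USE_V1). A minimal usage sketch (model id and prompt are placeholders):

from vllm import SamplingParams
from ipex_llm.vllm.cpu.engine import IPEXLLMClass

llm = IPEXLLMClass(model="facebook/opt-125m",   # placeholder model id
                   load_in_low_bit="sym_int4",  # the new default, shown explicitly
                   dtype="bfloat16",
                   enforce_eager=True)
outputs = llm.generate(["What does AVX-512 VNNI accelerate?"],
                       SamplingParams(temperature=0.8, max_tokens=64))
print(outputs[0].outputs[0].text)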
@@ -136,35 +211,44 @@ class IPEXLLMLLMEngine(LLMEngine):
         cls,
         engine_args: EngineArgs,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        load_in_low_bit: Optional[str] = None,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+        load_in_low_bit: str = "sym_int4",
     ) -> "LLMEngine":
         """Creates an LLM engine from the engine arguments."""
         # Create the engine configs.
-        engine_config = engine_args.create_engine_config()
-        from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+        # TODO(gc): Delete
+        print("Use vLLM v0 engine")
         _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args, usage_context, stat_loggers)

-        # Initialize the cluster and specify the executor class.
-        if engine_config.device_config.device_type == "neuron":
-            from vllm.executor.neuron_executor import NeuronExecutor
-            executor_class = NeuronExecutor
-        elif engine_config.device_config.device_type == "cpu":
-            from vllm.executor.cpu_executor import CPUExecutor
-            executor_class = CPUExecutor
-        elif engine_config.parallel_config.worker_use_ray:
-            initialize_ray_cluster(engine_config.parallel_config)
-            from vllm.executor.ray_gpu_executor import RayGPUExecutor
-            executor_class = RayGPUExecutor
-        else:
-            invalidInputError(engine_config.parallel_config.world_size == 1, (
-                "Ray is required if parallel_config.world_size > 1."))
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-
-        # Create the LLM engine.
-        engine = cls(**engine_config.to_dict(),
-                     executor_class=executor_class,
-                     log_stats=not engine_args.disable_log_stats,
-                     usage_context=usage_context,
-                     )
-        return engine
+
+class IPEXLLMMQLLMEngine(MQLLMEngine):
+    @classmethod
+    def from_engine_args(cls, engine_args: AsyncEngineArgs,
+                         usage_context: UsageContext, ipc_path: str, load_in_low_bit: str):
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args, usage_context, ipc_path)
+
+
+def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
+                  ipc_path: str, load_in_low_bit: str, engine_alive):
+
+    def signal_handler(*_) -> None:
+        # Interrupt server on sigterm
+        raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
+
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa
+
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine
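run_mp_engine is what the new entrypoints spawn in a separate process so the OpenAI-compatible server can talk to the engine over IPC. A hedged launch sketch, assuming upstream vLLM's get_open_zmq_ipc_path helper and a placeholder model id; the client side (an MQLLMEngine client connecting to ipc_path) is omitted:

from multiprocessing import get_context

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import get_open_zmq_ipc_path  # assumption: upstream vLLM helper
from ipex_llm.vllm.cpu.engine import run_mp_engine

if __name__ == "__main__":  # required for the spawn start method
    ctx = get_context("spawn")
    engine_alive = ctx.Value("b", True, lock=False)  # shared liveness flag
    ipc_path = get_open_zmq_ipc_path()

    proc = ctx.Process(target=run_mp_engine,
                       args=(AsyncEngineArgs(model="facebook/opt-125m"),  # placeholder
                             UsageContext.OPENAI_API_SERVER,
                             ipc_path,
                             "sym_int4",       # load_in_low_bit
                             engine_alive))
    proc.start()
    # ...connect an engine client to ipc_path and serve requests here...
    proc.join()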