ipex-llm 2.2.0b20250120__py3-none-win_amd64.whl → 2.2.0b20250122__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +0 -1
- ipex_llm/transformers/low_bit_linear.py +8 -5
- ipex_llm/transformers/model.py +1 -3
- ipex_llm/transformers/patches.py +0 -11
- ipex_llm/transformers/utils.py +16 -10
- ipex_llm/vllm/cpu/engine/__init__.py +2 -1
- ipex_llm/vllm/cpu/engine/engine.py +159 -75
- ipex_llm/vllm/cpu/entrypoints/api_server.py +787 -0
- ipex_llm/vllm/cpu/entrypoints/openai/api_server.py +680 -95
- ipex_llm/vllm/cpu/entrypoints/openai/cli_args.py +277 -0
- ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py +23 -0
- ipex_llm/vllm/cpu/ipex_llm_wrapper.py +24 -0
- ipex_llm/vllm/cpu/model_convert.py +126 -233
- {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/METADATA +20 -20
- {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/RECORD +50 -46
- {ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250120.data → ipex_llm-2.2.0b20250122.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250120.dist-info → ipex_llm-2.2.0b20250122.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll
CHANGED
Binary file
|
ipex_llm/libs/bloom.dll
CHANGED
Binary file
|
ipex_llm/libs/gptneox-api.dll
CHANGED
Binary file
|
ipex_llm/libs/gptneox.dll
CHANGED
Binary file
|
ipex_llm/libs/libbloom_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libbloom_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/libgptneox_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libgptneox_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/libllama_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libllama_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/libstarcoder_avx.dll
CHANGED
Binary file
|
ipex_llm/libs/libstarcoder_vnni.dll
CHANGED
Binary file
|
ipex_llm/libs/llama-api.dll
CHANGED
Binary file
|
ipex_llm/libs/llama.dll
CHANGED
Binary file
|
ipex_llm/libs/main-bloom.exe
CHANGED
Binary file
|
ipex_llm/libs/main-gptneox.exe
CHANGED
Binary file
|
ipex_llm/libs/main-llama.exe
CHANGED
Binary file
|
ipex_llm/libs/main-starcoder.exe
CHANGED
Binary file
|
ipex_llm/libs/pipeline.dll
CHANGED
Binary file
|
ipex_llm/libs/quantize-bloom.exe
CHANGED
Binary file
|
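ipex_llm/libs/quantize-bloom_vnni.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-gptneox.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-gptneox_vnni.exe
CHANGED
Binary file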
|
ipex_llm/libs/quantize-llama.exe
CHANGED
Binary file
|
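ipex_llm/libs/quantize-llama_vnni.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-starcoder.exe
CHANGED
Binary file
|
ipex_llm/libs/quantize-starcoder_vnni.exe
CHANGED
Binary file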
|
ipex_llm/libs/starcoder-api.dll
CHANGED
Binary file
|
ipex_llm/libs/starcoder.dll
CHANGED
Binary file
|
ipex_llm/transformers/convert.py
CHANGED
|
ipex_llm/transformers/low_bit_linear.py
CHANGED
@@ -204,12 +204,15 @@ def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int
|
|
204
204
|
|
205
205
|
|
206
206
|
def ggml_q_format_convet_xpu2cpu(tensor: torch.Tensor, num_elem: int, qtype: int):
|
207
|
-
|
208
|
-
|
209
|
-
|
207
|
+
if qtype == NF4:
|
208
|
+
invalidInputError(tensor.dtype == torch.bfloat16,
|
209
|
+
"NF4 Input tensor must be bfloat16")
|
210
|
+
else:
|
211
|
+
invalidInputError(tensor.dtype == torch.uint8,
|
212
|
+
"Input tensor must be uint8")
|
210
213
|
|
211
214
|
invalidInputError(tensor.device == torch.device('cpu'),
|
212
|
-
"Input tensor must be
|
215
|
+
"Input tensor must be on cpu")
|
213
216
|
|
214
217
|
src = ctypes.c_void_p(tensor.data.data_ptr())
|
215
218
|
|
@@ -746,7 +749,7 @@ class LowBitLinear(nn.Linear):
|
|
746
749
|
dist.inference_all_reduce(result, group=self.mp_group)
|
747
750
|
if self.bias is not None:
|
748
751
|
result += self.bias
|
749
|
-
return result
|
752
|
+
return result.to(x.dtype)
|
750
753
|
|
751
754
|
|
752
755
|
class FP16Linear(nn.Linear):
|
ipex_llm/transformers/model.py
CHANGED
@@ -51,7 +51,7 @@ from ipex_llm.transformers.gguf.api import load_gguf_model
|
|
51
51
|
|
52
52
|
from .utils import logger, load_state_dict
|
53
53
|
from .utils import extract_local_archive_file, get_local_shard_files, load_imatrix_data
|
54
|
-
from .patches import patch_flash_attn_import
|
54
|
+
from .patches import patch_flash_attn_import
|
55
55
|
|
56
56
|
patched_training_mode = None
|
57
57
|
|
@@ -108,7 +108,6 @@ class _BaseAutoModelClass:
|
|
108
108
|
|
109
109
|
@classmethod
|
110
110
|
@patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
|
111
|
-
@patch("transformers.modeling_utils.is_torch_sdpa_available", patch_sdpa_available, create=True)
|
112
111
|
def from_pretrained(cls,
|
113
112
|
*args,
|
114
113
|
**kwargs):
|
@@ -531,7 +530,6 @@ class _BaseAutoModelClass:
|
|
531
530
|
|
532
531
|
@classmethod
|
533
532
|
@patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
|
534
|
-
@patch("transformers.modeling_utils.is_torch_sdpa_available", patch_sdpa_available, create=True)
|
535
533
|
def load_low_bit(cls,
|
536
534
|
pretrained_model_name_or_path,
|
537
535
|
*model_args,
|
ipex_llm/transformers/patches.py
CHANGED
@@ -26,14 +26,3 @@ def patch_flash_attn_import(filename: str) -> List[str]:
|
|
26
26
|
if "flash_attn" in imports:
|
27
27
|
imports.remove("flash_attn")
|
28
28
|
return imports
|
29
|
-
|
30
|
-
|
31
|
-
def patch_sdpa_available() -> bool:
|
32
|
-
if IPEXImporter.is_xpu_version_installed():
|
33
|
-
return False
|
34
|
-
else:
|
35
|
-
try:
|
36
|
-
from transformers.utils import is_torch_sdpa_available
|
37
|
-
return is_torch_sdpa_available()
|
38
|
-
except ImportError:
|
39
|
-
return False
|
ipex_llm/transformers/utils.py
CHANGED
@@ -139,19 +139,25 @@ def fix_key(key):
|
|
139
139
|
|
140
140
|
|
141
141
|
def get_autocast_dtype(x):
|
142
|
-
if x.device.type == "xpu":
|
143
|
-
if torch.xpu.is_autocast_xpu_enabled():
|
144
|
-
return torch.xpu.get_autocast_xpu_dtype()
|
145
|
-
else:
|
146
|
-
return None
|
147
|
-
elif x.device.type == "cpu":
|
148
|
-
if torch.is_autocast_cpu_enabled():
|
149
|
-
return torch.get_autocast_cpu_dtype()
|
142
|
+
if torch.__version__ >= '2.3':
|
143
|
+
if torch.is_autocast_enabled(x.device.type):
|
144
|
+
return torch.get_autocast_dtype(x.device.type)
|
150
145
|
else:
|
151
146
|
return None
|
152
147
|
else:
|
153
|
-
|
154
|
-
|
148
|
+
if x.device.type == "xpu":
|
149
|
+
if torch.xpu.is_autocast_xpu_enabled():
|
150
|
+
return torch.xpu.get_autocast_xpu_dtype()
|
151
|
+
else:
|
152
|
+
return None
|
153
|
+
elif x.device.type == "cpu":
|
154
|
+
if torch.is_autocast_cpu_enabled():
|
155
|
+
return torch.get_autocast_cpu_dtype()
|
156
|
+
else:
|
157
|
+
return None
|
158
|
+
else:
|
159
|
+
invalidInputError(False,
|
160
|
+
f"Device {x.device} is not supported.")
|
155
161
|
|
156
162
|
|
157
163
|
def get_xpu_device_name(device: torch.device):
|
ipex_llm/vllm/cpu/engine/__init__.py
CHANGED
@@ -13,9 +13,10 @@
|
|
13
13
|
# See the License for the specific language governing permissions and
|
14
14
|
# limitations under the License.
|
15
15
|
#
|
16
|
-
from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass
|
16
|
+
from .engine import IPEXLLMAsyncLLMEngine, IPEXLLMLLMEngine, IPEXLLMClass, run_mp_engine
|
17
17
|
__all__ = [
|
18
18
|
"IPEXLLMAsyncLLMEngine",
|
19
19
|
"IPEXLLMLLMEngine",
|
20
20
|
"IPEXLLMClass",
|
21
|
+
"run_mp_engine",
|
21
22
|
]
|
ipex_llm/vllm/cpu/engine/engine.py
CHANGED
@@ -13,18 +13,28 @@
|
|
13
13
|
# See the License for the specific language governing permissions and
|
14
14
|
# limitations under the License.
|
15
15
|
#
|
16
|
-
|
17
|
-
from typing import
|
16
|
+
from vllm.logger import init_logger
|
17
|
+
from typing import Dict, Optional, Any, Union, Type
|
18
18
|
from vllm.engine.llm_engine import LLMEngine
|
19
19
|
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
20
20
|
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
21
21
|
from vllm.entrypoints.llm import LLM
|
22
|
-
from vllm.executor.ray_utils import initialize_ray_cluster
|
23
|
-
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
|
24
|
-
usage_message)
|
25
22
|
from vllm.utils import Counter
|
23
|
+
from vllm.config import VllmConfig
|
24
|
+
from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
|
25
|
+
from vllm.usage.usage_lib import UsageContext
|
26
|
+
from vllm.engine.metrics import StatLoggerBase
|
27
|
+
from vllm.engine.multiprocessing.engine import MQLLMEngine
|
28
|
+
import signal
|
29
|
+
from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
|
30
|
+
TaskOption)
|
31
|
+
from vllm.config import CompilationConfig
|
32
|
+
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
33
|
+
from vllm import envs
|
34
|
+
from vllm.v1.engine.async_llm import AsyncLLM
|
35
|
+
import os
|
26
36
|
|
27
|
-
|
37
|
+
logger = init_logger(__name__)
|
28
38
|
|
29
39
|
|
30
40
|
class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
|
@@ -35,49 +45,43 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
|
|
35
45
|
def from_engine_args(
|
36
46
|
cls,
|
37
47
|
engine_args: AsyncEngineArgs,
|
48
|
+
engine_config: Optional[VllmConfig] = None,
|
38
49
|
start_engine_loop: bool = True,
|
39
50
|
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
40
|
-
load_in_low_bit:
|
51
|
+
load_in_low_bit: str = "sym_int4",
|
52
|
+
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
|
41
53
|
) -> "AsyncLLMEngine":
|
42
54
|
"""Creates an async LLM engine from the engine arguments."""
|
43
|
-
#
|
44
|
-
engine_config = engine_args.create_engine_config()
|
45
|
-
from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
|
55
|
+
# Create the engine configs.
|
46
56
|
_ipex_llm_convert(load_in_low_bit)
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
elif engine_config.device_config.device_type == "cpu":
|
51
|
-
invalidInputError(not engine_config.parallel_config.worker_use_ray, (
|
52
|
-
"Ray is not supported with the CPU backend."))
|
53
|
-
from vllm.executor.cpu_executor import CPUExecutorAsync
|
54
|
-
executor_class = CPUExecutorAsync
|
55
|
-
elif engine_config.parallel_config.worker_use_ray:
|
56
|
-
initialize_ray_cluster(engine_config.parallel_config)
|
57
|
-
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
|
58
|
-
executor_class = RayGPUExecutorAsync
|
59
|
-
else:
|
60
|
-
invalidInputError(engine_config.parallel_config.world_size == 1, (
|
61
|
-
"Ray is required if parallel_config.world_size > 1."))
|
62
|
-
from vllm.executor.gpu_executor import GPUExecutorAsync
|
63
|
-
executor_class = GPUExecutorAsync
|
64
|
-
# Create the async LLM engine.
|
65
|
-
engine = cls(
|
66
|
-
engine_config.parallel_config.worker_use_ray,
|
67
|
-
engine_args.engine_use_ray,
|
68
|
-
**engine_config.to_dict(),
|
69
|
-
executor_class=executor_class,
|
70
|
-
log_requests=not engine_args.disable_log_requests,
|
71
|
-
log_stats=not engine_args.disable_log_stats,
|
72
|
-
max_log_len=engine_args.max_log_len,
|
73
|
-
start_engine_loop=start_engine_loop,
|
74
|
-
usage_context=usage_context,
|
75
|
-
)
|
76
|
-
return engine
|
57
|
+
return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
|
58
|
+
start_engine_loop=start_engine_loop,
|
59
|
+
usage_context=usage_context, stat_loggers=stat_loggers)
|
77
60
|
|
78
61
|
|
79
|
-
class
|
62
|
+
class IPEXLLMAsyncV1Engine(AsyncLLM):
|
80
63
|
|
64
|
+
def __init__(self, *args, **kwargs):
|
65
|
+
print("IPEX-LLM V1 engine get started...")
|
66
|
+
super().__init__(*args, **kwargs)
|
67
|
+
|
68
|
+
@classmethod
|
69
|
+
def from_engine_args(
|
70
|
+
cls,
|
71
|
+
engine_args: AsyncEngineArgs,
|
72
|
+
engine_config: Optional[VllmConfig] = None,
|
73
|
+
start_engine_loop: bool = True,
|
74
|
+
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
75
|
+
load_in_low_bit: str = "sym_int4",
|
76
|
+
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
|
77
|
+
) -> "AsyncLLM":
|
78
|
+
_ipex_llm_convert(load_in_low_bit)
|
79
|
+
return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
|
80
|
+
start_engine_loop=start_engine_loop,
|
81
|
+
usage_context=usage_context, stat_loggers=stat_loggers)
|
82
|
+
|
83
|
+
|
84
|
+
class IPEXLLMClass(LLM):
|
81
85
|
def __init__(
|
82
86
|
self,
|
83
87
|
model: str,
|
@@ -85,6 +89,7 @@ class IPEXLLMClass(LLM):
|
|
85
89
|
tokenizer_mode: str = "auto",
|
86
90
|
skip_tokenizer_init: bool = False,
|
87
91
|
trust_remote_code: bool = False,
|
92
|
+
allowed_local_media_path: str = "",
|
88
93
|
tensor_parallel_size: int = 1,
|
89
94
|
dtype: str = "auto",
|
90
95
|
quantization: Optional[str] = None,
|
@@ -92,22 +97,48 @@ class IPEXLLMClass(LLM):
|
|
92
97
|
tokenizer_revision: Optional[str] = None,
|
93
98
|
seed: int = 0,
|
94
99
|
gpu_memory_utilization: float = 0.9,
|
95
|
-
swap_space:
|
96
|
-
|
97
|
-
|
100
|
+
swap_space: float = 4,
|
101
|
+
cpu_offload_gb: float = 0,
|
102
|
+
enforce_eager: Optional[bool] = None,
|
98
103
|
max_seq_len_to_capture: int = 8192,
|
99
104
|
disable_custom_all_reduce: bool = False,
|
100
|
-
|
105
|
+
disable_async_output_proc: bool = True,
|
106
|
+
hf_overrides: Optional[HfOverrides] = None,
|
107
|
+
mm_processor_kwargs: Optional[Dict[str, Any]]=None,
|
108
|
+
# After positional args are removed, move this right below `model`
|
109
|
+
task: TaskOption = "auto",
|
110
|
+
override_pooler_config: Optional[PoolerConfig] = None,
|
111
|
+
compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
|
112
|
+
load_in_low_bit: str = "sym_int4",
|
101
113
|
**kwargs,
|
102
114
|
) -> None:
|
115
|
+
'''
|
116
|
+
LLM constructor.
|
117
|
+
|
118
|
+
Note: if enforce_eager is unset (enforce_eager is None)
|
119
|
+
it defaults to False.
|
120
|
+
'''
|
121
|
+
|
103
122
|
if "disable_log_stats" not in kwargs:
|
104
123
|
kwargs["disable_log_stats"] = True
|
124
|
+
|
125
|
+
if compilation_config is not None:
|
126
|
+
if isinstance(compilation_config, (int, dict)):
|
127
|
+
compilation_config_instance = CompilationConfig.from_cli(
|
128
|
+
str(compilation_config))
|
129
|
+
else:
|
130
|
+
compilation_config_instance = compilation_config
|
131
|
+
else:
|
132
|
+
compilation_config_instance = None
|
133
|
+
|
105
134
|
engine_args = EngineArgs(
|
106
135
|
model=model,
|
136
|
+
task=task,
|
107
137
|
tokenizer=tokenizer,
|
108
138
|
tokenizer_mode=tokenizer_mode,
|
109
139
|
skip_tokenizer_init=skip_tokenizer_init,
|
110
140
|
trust_remote_code=trust_remote_code,
|
141
|
+
allowed_local_media_path=allowed_local_media_path,
|
111
142
|
tensor_parallel_size=tensor_parallel_size,
|
112
143
|
dtype=dtype,
|
113
144
|
quantization=quantization,
|
@@ -116,16 +147,60 @@ class IPEXLLMClass(LLM):
|
|
116
147
|
seed=seed,
|
117
148
|
gpu_memory_utilization=gpu_memory_utilization,
|
118
149
|
swap_space=swap_space,
|
150
|
+
cpu_offload_gb=cpu_offload_gb,
|
119
151
|
enforce_eager=enforce_eager,
|
120
|
-
max_context_len_to_capture=max_context_len_to_capture,
|
121
152
|
max_seq_len_to_capture=max_seq_len_to_capture,
|
122
153
|
disable_custom_all_reduce=disable_custom_all_reduce,
|
154
|
+
disable_async_output_proc=disable_async_output_proc,
|
155
|
+
hf_overrides=hf_overrides,
|
156
|
+
mm_processor_kwargs=mm_processor_kwargs,
|
157
|
+
override_pooler_config=override_pooler_config,
|
158
|
+
compilation_config=compilation_config_instance,
|
123
159
|
**kwargs,
|
124
160
|
)
|
125
|
-
|
126
|
-
|
161
|
+
# Logic to switch between engines is done at runtime instead of import
|
162
|
+
# to avoid import order issues
|
163
|
+
# TODO(gc): we will need to override this function
|
164
|
+
self.engine_class = self.get_engine_class()
|
165
|
+
self.llm_engine = self.engine_class.from_engine_args(
|
166
|
+
engine_args, usage_context=UsageContext.LLM_CLASS,
|
167
|
+
load_in_low_bit=load_in_low_bit)
|
168
|
+
|
127
169
|
self.request_counter = Counter()
|
128
170
|
|
171
|
+
@staticmethod
|
172
|
+
def get_engine_class() -> Type[LLMEngine]:
|
173
|
+
if envs.VLLM_USE_V1:
|
174
|
+
# Lazy import: the v1 package isn't distributed
|
175
|
+
# from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
176
|
+
return IPEXLLMLLMV1Engine # type: ignore
|
177
|
+
return IPEXLLMLLMEngine
|
178
|
+
|
179
|
+
|
180
|
+
# TODO(gc): implement this later...
|
181
|
+
class IPEXLLMLLMV1Engine(V1LLMEngine):
|
182
|
+
def __init__(self, *args, **kwargs):
|
183
|
+
super().__init__(*args, **kwargs)
|
184
|
+
|
185
|
+
@classmethod
|
186
|
+
def from_engine_args(
|
187
|
+
cls,
|
188
|
+
engine_args: EngineArgs,
|
189
|
+
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
190
|
+
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
|
191
|
+
enable_multiprocessing: bool = False,
|
192
|
+
load_in_low_bit: str = "sym_int4",
|
193
|
+
) -> "LLMEngine":
|
194
|
+
"""Creates an LLM engine from the engine arguments."""
|
195
|
+
# Create the engine configs.
|
196
|
+
|
197
|
+
# TODO(gc): delete this later
|
198
|
+
print("IPEXLLM V1 Engine")
|
199
|
+
# This does not work as it is in the seperate process...
|
200
|
+
_ipex_llm_convert(load_in_low_bit)
|
201
|
+
return super().from_engine_args(engine_args, usage_context,
|
202
|
+
stat_loggers, enable_multiprocessing)
|
203
|
+
|
129
204
|
|
130
205
|
class IPEXLLMLLMEngine(LLMEngine):
|
131
206
|
def __init__(self, *args, **kwargs):
|
@@ -136,35 +211,44 @@ class IPEXLLMLLMEngine(LLMEngine):
|
|
136
211
|
cls,
|
137
212
|
engine_args: EngineArgs,
|
138
213
|
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
139
|
-
|
214
|
+
stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
|
215
|
+
load_in_low_bit: str = "sym_int4",
|
140
216
|
) -> "LLMEngine":
|
141
217
|
"""Creates an LLM engine from the engine arguments."""
|
142
218
|
# Create the engine configs.
|
143
|
-
|
144
|
-
|
219
|
+
# TODO(gc): Delete
|
220
|
+
print("Use vLLM v0 engine")
|
145
221
|
_ipex_llm_convert(load_in_low_bit)
|
222
|
+
return super().from_engine_args(engine_args, usage_context, stat_loggers)
|
146
223
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
224
|
+
|
225
|
+
class IPEXLLMMQLLMEngine(MQLLMEngine):
|
226
|
+
@classmethod
|
227
|
+
def from_engine_args(cls, engine_args: AsyncEngineArgs,
|
228
|
+
usage_context: UsageContext, ipc_path: str, load_in_low_bit: str):
|
229
|
+
_ipex_llm_convert(load_in_low_bit)
|
230
|
+
return super().from_engine_args(engine_args, usage_context, ipc_path)
|
231
|
+
|
232
|
+
|
233
|
+
def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
|
234
|
+
ipc_path: str, load_in_low_bit: str, engine_alive):
|
235
|
+
|
236
|
+
def signal_handler(*_) -> None:
|
237
|
+
# Interrupt server on sigterm
|
238
|
+
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa
|
239
|
+
|
240
|
+
try:
|
241
|
+
signal.signal(signal.SIGTERM, signal_handler)
|
242
|
+
|
243
|
+
engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
|
244
|
+
usage_context=usage_context,
|
245
|
+
ipc_path=ipc_path,
|
246
|
+
load_in_low_bit=load_in_low_bit)
|
247
|
+
engine.start()
|
248
|
+
except BaseException as e:
|
249
|
+
logger.exception(e)
|
250
|
+
engine_alive.value = False
|
251
|
+
raise e # noqa
|
252
|
+
|
253
|
+
if os.getenv("VLLM_USE_V1"):
|
254
|
+
IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine
|