ipex-llm 2.2.0b20250211__py3-none-win_amd64.whl → 2.2.0b20250212__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +4 -3
- ipex_llm/transformers/models/janus.py +49 -0
- ipex_llm/transformers/models/utils.py +1 -1
- ipex_llm/vllm/xpu/engine/engine.py +117 -20
- ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
- ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
- ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
- ipex_llm/vllm/xpu/model_convert.py +25 -19
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/METADATA +19 -19
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/RECORD +45 -43
- {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250211.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250211.dist-info → ipex_llm-2.2.0b20250212.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll               CHANGED  Binary file
ipex_llm/libs/bloom.dll                   CHANGED  Binary file
ipex_llm/libs/gptneox-api.dll             CHANGED  Binary file
ipex_llm/libs/gptneox.dll                 CHANGED  Binary file
ipex_llm/libs/libbloom_avx.dll            CHANGED  Binary file
ipex_llm/libs/libbloom_vnni.dll           CHANGED  Binary file
ipex_llm/libs/libgptneox_avx.dll          CHANGED  Binary file
ipex_llm/libs/libgptneox_vnni.dll         CHANGED  Binary file
ipex_llm/libs/libllama_avx.dll            CHANGED  Binary file
ipex_llm/libs/libllama_vnni.dll           CHANGED  Binary file
ipex_llm/libs/libstarcoder_avx.dll        CHANGED  Binary file
ipex_llm/libs/libstarcoder_vnni.dll       CHANGED  Binary file
ipex_llm/libs/llama-api.dll               CHANGED  Binary file
ipex_llm/libs/llama.dll                   CHANGED  Binary file
ipex_llm/libs/main-bloom.exe              CHANGED  Binary file
ipex_llm/libs/main-gptneox.exe            CHANGED  Binary file
ipex_llm/libs/main-llama.exe              CHANGED  Binary file
ipex_llm/libs/main-starcoder.exe          CHANGED  Binary file
ipex_llm/libs/pipeline.dll                CHANGED  Binary file
ipex_llm/libs/quantize-bloom.exe          CHANGED  Binary file
ipex_llm/libs/quantize-bloom_vnni.exe     CHANGED  Binary file
ipex_llm/libs/quantize-gptneox.exe        CHANGED  Binary file
ipex_llm/libs/quantize-gptneox_vnni.exe   CHANGED  Binary file
ipex_llm/libs/quantize-llama.exe          CHANGED  Binary file
ipex_llm/libs/quantize-llama_vnni.exe     CHANGED  Binary file
ipex_llm/libs/quantize-starcoder.exe      CHANGED  Binary file
ipex_llm/libs/quantize-starcoder_vnni.exe CHANGED  Binary file
ipex_llm/libs/starcoder-api.dll           CHANGED  Binary file
ipex_llm/libs/starcoder.dll               CHANGED  Binary file
ipex_llm/transformers/convert.py
CHANGED
@@ -667,7 +667,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 out_features,
                 mp_group,
                 None,
-                None,
                 optimize_lm_head,
                 None
             )
@@ -1066,7 +1065,7 @@ def _optimize_pre(model, qtype=None):
         from ipex_llm.transformers.models.baichuan_m1 import pre_register_inv_freq
         model.apply(pre_register_inv_freq)
     elif model.config.model_type == "multi_modality":
-
+        _optimize_pre(model.language_model)

     return model

@@ -2012,8 +2011,10 @@ def _optimize_post(model):
         # vision
         vpm_modeling_module_name = model.vision_model.vision_tower.__class__.__module__
         vpm_module = importlib.import_module(vpm_modeling_module_name)
-
         from ipex_llm.transformers.models.janus import vision_attention_forward
         convert_forward(model.vision_model, vpm_module.Attention, vision_attention_forward)

+        # llm
+        _optimize_post(model.language_model)
+
     return model
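The convert.py hunks wire Janus ("multi_modality") models into the existing optimization passes: the vision tower's Attention.forward is swapped for the new implementation in janus.py via convert_forward, and the language model is now optimized recursively as well. For orientation, the forward-replacement pattern convert_forward relies on looks roughly like this (a minimal sketch; the real ipex-llm helper may differ in details such as module filtering):

import types
import torch

def convert_forward_sketch(model: torch.nn.Module, target_cls, new_forward):
    # Rebind `forward` on every submodule that is an instance of target_cls,
    # so the optimized implementation runs without touching the class itself.
    for module in model.modules():
        if isinstance(module, target_cls):
            module.forward = types.MethodType(new_forward, module)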
ipex_llm/transformers/models/janus.py
ADDED
@@ -0,0 +1,49 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/deepseek-ai/Janus/blob/main/janus/models/siglip_vit.py
+
+import torch
+
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+
+
+def vision_attention_forward(self, x: torch.Tensor) -> torch.Tensor:
+    B, N, C = x.shape
+    qkv = (
+        self.qkv(x)
+        .reshape(B, N, 3, self.num_heads, self.head_dim)
+        .permute(2, 0, 3, 1, 4)
+    )
+    q, k, v = qkv.unbind(0)
+    q, k = self.q_norm(q), self.k_norm(k)
+
+    if self.fused_attn:
+        # ipex-llm opt: sdpa
+        x = scaled_dot_product_attention(
+            q, k.contiguous(), v.contiguous(), None, False
+        )
+    else:
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+    x = x.transpose(1, 2).reshape(B, N, C)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
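The new vision_attention_forward keeps the reference (unfused) attention path and, when fused_attn is set, routes through ipex-llm's scaled_dot_product_attention helper. Assuming the helper follows the same (query, key, value, attn_mask, is_causal) semantics as torch's built-in SDPA, the two branches are numerically equivalent:

import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 16, 64)  # (batch, heads, tokens, head_dim)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Manual path, with scale = head_dim ** -0.5 as in the unfused branch above.
manual = (q @ k.transpose(-2, -1) * q.shape[-1] ** -0.5).softmax(dim=-1) @ v
# Fused path; torch's SDPA applies the same default scaling.
fused = F.scaled_dot_product_attention(q, k, v)
assert torch.allclose(manual, fused, atol=1e-5)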
ipex_llm/transformers/models/utils.py
CHANGED
@@ -86,7 +86,7 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
-    elif linear.
+    elif linear.weight.dtype != torch.uint8:  # unquantized
         return False
     else:
         device_name = get_xpu_device_name(x.device)
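The utils.py fix makes the condition explicit: after the environment overrides are consulted, a linear layer whose weight dtype is not uint8 (i.e. not low-bit quantized) never gets a quantized KV cache. A standalone sketch of that decision order (hypothetical helper; the real function goes on to inspect the XPU device name and model config in the uint8 case):

import os
import torch

def should_quantize_kv_cache(weight: torch.Tensor) -> bool:
    # Explicit environment overrides win.
    if os.environ.get("IPEX_LLM_QUANTIZE_KV_CACHE") is not None:
        return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
    if os.environ.get("IPEX_LLM_LOW_MEM") is not None:
        return os.environ["IPEX_LLM_LOW_MEM"] == "1"
    if weight.dtype != torch.uint8:  # unquantized weights: plain KV cache
        return False
    return True  # real code decides here based on device name and model config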
ipex_llm/vllm/xpu/engine/engine.py
CHANGED
@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
 from vllm.utils import Counter
-from vllm.config import
+from vllm.config import VllmConfig
 from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
 from vllm.usage.usage_lib import UsageContext
 from vllm.engine.metrics import StatLoggerBase
 from vllm.engine.multiprocessing.engine import MQLLMEngine
 import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os
+
+logger = init_logger(__name__)


 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
-        engine_config: Optional[
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         load_in_low_bit: str = "sym_int4",
@@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
                                         usage_context=usage_context, stat_loggers=stat_loggers)


+class IPEXLLMAsyncV1Engine(AsyncLLM):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,  # noqa
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+
+
 class IPEXLLMClass(LLM):
     def __init__(
         self,
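The new IPEXLLMAsyncV1Engine mirrors the existing V0 wrapper: run the low-bit conversion hook first, then delegate construction to vLLM's own factory. The pattern in isolation (a generic sketch, not vLLM's actual signatures):

class BaseEngine:
    @classmethod
    def from_engine_args(cls, engine_args):
        return cls()  # stands in for vLLM's real factory

class LowBitEngine(BaseEngine):
    @classmethod
    def from_engine_args(cls, engine_args, load_in_low_bit: str = "sym_int4"):
        # Apply the side effect (patch model loading) before delegating; cls is
        # preserved through super(), so the subclass is what gets instantiated.
        print(f"converting weights to {load_in_low_bit}")  # stands in for _ipex_llm_convert
        return super().from_engine_args(engine_args)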
@@ -57,6 +88,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -64,28 +96,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space:
+        swap_space: float = 4,
         cpu_offload_gb: float = 0,
-        enforce_eager: bool =
-        max_context_len_to_capture: Optional[int] = None,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
         load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
+
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
-
-
-
-
-
+
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
+
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
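The constructor now accepts compilation_config as an int, a dict, or a ready-made object, and normalizes it before building EngineArgs (CompilationConfig.from_cli is vLLM's parser for CLI-style values). The normalization shape in miniature, with a stand-in dataclass:

from dataclasses import dataclass
from typing import Any, Dict, Optional, Union

@dataclass
class CompileCfg:  # stand-in for vllm.config.CompilationConfig
    level: int = 0

def normalize(cfg: Optional[Union[int, Dict[str, Any], CompileCfg]]) -> Optional[CompileCfg]:
    # Plain CLI-style values are parsed into a config object; a ready-made
    # object passes through; None stays None.
    if cfg is None:
        return None
    if isinstance(cfg, int):
        return CompileCfg(level=cfg)
    if isinstance(cfg, dict):
        return CompileCfg(**cfg)
    return cfg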
@@ -96,16 +148,53 @@ class IPEXLLMClass(LLM):
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
             engine_args, usage_context=UsageContext.LLM_CLASS,
             load_in_low_bit=load_in_low_bit)
+
         self.request_counter = Counter()

+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            return IPEXLLMLLMV1Engine
+        return IPEXLLMLLMEngine
+
+
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args,
+                                        usage_context,
+                                        stat_loggers,
+                                        enable_multiprocessing)
+

 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
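Note the comment in the hunk above: the V0/V1 choice is made when IPEXLLMClass is constructed, not at import, so it tracks VLLM_USE_V1 at call time and avoids import-order issues. The same late-binding factory in isolation (generic sketch):

import os

class EngineV0: ...
class EngineV1: ...

def get_engine_class():
    # Resolved at call time rather than import time, so setting the environment
    # variable after import still takes effect.
    if os.environ.get("VLLM_USE_V1") == "1":
        return EngineV1
    return EngineV0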
@@ -134,16 +223,24 @@ class IPEXLLMMQLLMEngine(MQLLMEngine):


 def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str, load_in_low_bit: str):
+                  ipc_path: str, load_in_low_bit: str, engine_alive):

     def signal_handler(*_) -> None:
         # Interrupt server on sigterm
         raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa

-
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa

-
-
-                  ipc_path=ipc_path,
-                  load_in_low_bit=load_in_low_bit)
-    engine.start()
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine