ipex-llm 2.2.0b20250210__py3-none-win_amd64.whl → 2.2.0b20250212__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +23 -1
- ipex_llm/transformers/low_bit_linear.py +1 -1
- ipex_llm/transformers/models/baichuan_m1.py +240 -0
- ipex_llm/transformers/models/janus.py +49 -0
- ipex_llm/transformers/models/utils.py +1 -1
- ipex_llm/vllm/xpu/engine/engine.py +117 -20
- ipex_llm/vllm/xpu/entrypoints/openai/api_server.py +379 -95
- ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py +57 -8
- ipex_llm/vllm/xpu/ipex_llm_v1_wrapper.py +23 -0
- ipex_llm/vllm/xpu/model_convert.py +25 -19
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/METADATA +19 -19
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/RECORD +47 -44
- {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250210.data → ipex_llm-2.2.0b20250212.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250210.dist-info → ipex_llm-2.2.0b20250212.dist-info}/top_level.txt +0 -0
ipex_llm/libs/bloom-api.dll CHANGED (binary file)
ipex_llm/libs/bloom.dll CHANGED (binary file)
ipex_llm/libs/gptneox-api.dll CHANGED (binary file)
ipex_llm/libs/gptneox.dll CHANGED (binary file)
ipex_llm/libs/libbloom_avx.dll CHANGED (binary file)
ipex_llm/libs/libbloom_vnni.dll CHANGED (binary file)
ipex_llm/libs/libgptneox_avx.dll CHANGED (binary file)
ipex_llm/libs/libgptneox_vnni.dll CHANGED (binary file)
ipex_llm/libs/libllama_avx.dll CHANGED (binary file)
ipex_llm/libs/libllama_vnni.dll CHANGED (binary file)
ipex_llm/libs/libstarcoder_avx.dll CHANGED (binary file)
ipex_llm/libs/libstarcoder_vnni.dll CHANGED (binary file)
ipex_llm/libs/llama-api.dll CHANGED (binary file)
ipex_llm/libs/llama.dll CHANGED (binary file)
ipex_llm/libs/main-bloom.exe CHANGED (binary file)
ipex_llm/libs/main-gptneox.exe CHANGED (binary file)
ipex_llm/libs/main-llama.exe CHANGED (binary file)
ipex_llm/libs/main-starcoder.exe CHANGED (binary file)
ipex_llm/libs/pipeline.dll CHANGED (binary file)
ipex_llm/libs/quantize-bloom.exe CHANGED (binary file)
ipex_llm/libs/quantize-bloom_vnni.exe CHANGED (binary file)
ipex_llm/libs/quantize-gptneox.exe CHANGED (binary file)
ipex_llm/libs/quantize-gptneox_vnni.exe CHANGED (binary file)
ipex_llm/libs/quantize-llama.exe CHANGED (binary file)
ipex_llm/libs/quantize-llama_vnni.exe CHANGED (binary file)
ipex_llm/libs/quantize-starcoder.exe CHANGED (binary file)
ipex_llm/libs/quantize-starcoder_vnni.exe CHANGED (binary file)
ipex_llm/libs/starcoder-api.dll CHANGED (binary file)
ipex_llm/libs/starcoder.dll CHANGED (binary file)

ipex_llm/transformers/convert.py CHANGED

@@ -667,7 +667,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                     out_features,
                     mp_group,
                     None,
-                    None,
                     optimize_lm_head,
                     None
                 )
@@ -1062,6 +1061,11 @@ def _optimize_pre(model, qtype=None):
         from ipex_llm.transformers.models.glm import merge_qkv, split_mlp
         model.apply(merge_qkv)
         model.apply(split_mlp)
+    elif model.config.model_type == "baichuan_m1":
+        from ipex_llm.transformers.models.baichuan_m1 import pre_register_inv_freq
+        model.apply(pre_register_inv_freq)
+    elif model.config.model_type == "multi_modality":
+        _optimize_pre(model.language_model)
 
     return model
 
@@ -1994,5 +1998,23 @@ def _optimize_post(model):
         model.llm.config.rope_scaling = {"rope_type": "default"}
         _optimize_post(model.llm)
         model.llm.config.model_type = "megrezo"
+    elif model.config.model_type == "baichuan_m1":
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.baichuan_m1 import model_forward
+        from ipex_llm.transformers.models.baichuan_m1 import eager_attention_forward
+        convert_forward(model, module.BaichuanModel, model_forward)
+        convert_forward(model, module.BaichuanRMSNorm, rms_norm_forward)
+        convert_forward(model, module.BaichuanAttention, eager_attention_forward)
+    elif model.config.model_type == "multi_modality":
+        # vision
+        vpm_modeling_module_name = model.vision_model.vision_tower.__class__.__module__
+        vpm_module = importlib.import_module(vpm_modeling_module_name)
+        from ipex_llm.transformers.models.janus import vision_attention_forward
+        convert_forward(model.vision_model, vpm_module.Attention, vision_attention_forward)
+
+        # llm
+        _optimize_post(model.language_model)
 
     return model
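
The _optimize_pre/_optimize_post hooks above dispatch on model.config.model_type and patch module forwards through convert_forward. A rough standalone sketch of that mechanism, for illustration only (the real helper lives in ipex_llm.transformers.convert and is not reproduced here):

from types import MethodType
import torch

def convert_forward_sketch(model: torch.nn.Module, target_cls, new_forward):
    # Rebind forward on every instance of target_cls inside the model, so those
    # modules run the optimized implementation instead of the original one.
    for module in model.modules():
        if isinstance(module, target_cls):
            module.forward = MethodType(new_forward, module)
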
ipex_llm/transformers/low_bit_linear.py CHANGED

@@ -699,7 +699,7 @@ class LowBitLinear(nn.Linear):
         if is_server() and (not is_spr()) and \
                 self.qtype == SYM_INT4 and x_2d.shape[0] >= TORCH_LINEAR_THRESHOLD:
             x0_fp32 = ggml_int4_convert_fp32(x0, self.weight_shape, self.weight_length)
-            result = F.linear(x, x0_fp32)
+            result = F.linear(x.to(dtype=x0_fp32.dtype), x0_fp32)
         else:
             # Weight does not need a convert
             result = ggml_matmul_src1_x_src0_t(x0, x_2d, self.weight_shape, self.qtype)
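
The one-line fix above addresses a dtype mismatch on the server fallback path: the int4 weight is dequantized to fp32 while the activation may still be fp16, and F.linear requires matching dtypes. A minimal standalone illustration in plain PyTorch (not ipex-llm code):

import torch
import torch.nn.functional as F

x = torch.randn(4, 8, dtype=torch.float16)         # activation, e.g. fp16
w_fp32 = torch.randn(16, 8, dtype=torch.float32)   # weight dequantized to fp32

# F.linear(x, w_fp32) would raise a dtype-mismatch RuntimeError on most backends;
# casting the activation to the weight's dtype first keeps the matmul well defined.
y = F.linear(x.to(dtype=w_fp32.dtype), w_fp32)
print(y.shape, y.dtype)  # torch.Size([4, 16]) torch.float32
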
ipex_llm/transformers/models/baichuan_m1.py ADDED

@@ -0,0 +1,240 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://huggingface.co/baichuan-inc/Baichuan-M1-14B-Instruct/blob/main/modeling_baichuan.py
+
+
+import math
+import torch
+import torch.nn.functional as F
+
+from typing import Optional, Tuple, Union
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from ipex_llm.utils.common import invalidInputError
+from ipex_llm.transformers.models.utils import should_use_fuse_rope, repeat_kv
+from ipex_llm.transformers.models.common import attention_softmax
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+from ipex_llm.transformers.kv import DynamicNormalCache
+
+
+def pre_register_inv_freq(module: torch.nn.Module):
+    if module.__class__.__name__ == "RotaryEmbedding":
+        inv_freq = module.inv_freq
+        del module.inv_freq
+        module.register_buffer("inv_freq", inv_freq, persistent=False)
+
+
+# copied from Baichuan M1
+def custom_convolution(U, K):
+    """
+    U: Input matrix, shape (bs, seq, h, d)
+    K: Convolution kernel, shape (w, h)
+    Returns: Output matrix V, shape (bs, seq, h, d)
+    """
+    # h, w = K.shape
+    w = K.size(-1)
+    padding = (w - 1, 0)
+    U_padded = F.pad(U, (0, 0, 0, 0, *padding))  # Shape becomes (bs, seq+w-1, h, d)
+    U_unfolded = U_padded.unfold(1, w, 1)  # Shape becomes (bs, seq+w-1, h, d, w)
+    V_unfolded = U_unfolded * K  # Shape remains (bs, seq, h, d, w)
+    V = V_unfolded.sum(dim=-1)  # Shape becomes (bs, seq, h, d)
+    return V
+
+
+def model_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    seqlens: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Cache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+) -> Union[Tuple, BaseModelOutputWithPast]:
+    output_attentions = (
+        output_attentions if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    invalidInputError((input_ids is None) ^ (inputs_embeds is None),
+                      "You cannot specify both input_ids and inputs_embeds at the same time, "
+                      "and must specify either one")
+
+    if inputs_embeds is None:
+        inputs_embeds = self.embed_tokens(input_ids)
+
+    use_cache = use_cache if use_cache is not None else self.config.use_cache
+    use_cache = True if inputs_embeds.device.type == "xpu" else use_cache
+
+    # IPEX-LLM changes start: remove batch multi-pack and use ipex-llm's kv cache
+    # kept for BC (non `Cache` `past_key_values` inputs)
+    if use_cache and not isinstance(past_key_values, DynamicNormalCache):
+        past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
+    # IPEX-LLM changes end
+
+    if cache_position is None:
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        cache_position = torch.arange(
+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1],
+            device=inputs_embeds.device
+        )
+    if position_ids is None:
+        position_ids = cache_position.unsqueeze(0)
+
+    causal_mask = self._update_causal_mask(
+        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+    )
+
+    hidden_states = inputs_embeds
+
+    # create position embeddings to be shared across the decoder layers
+    # position_embeddings = self.rotary_emb(hidden_states, position_ids)
+    position_embeddings = None
+
+    # decoder layers
+    all_hidden_states = () if output_hidden_states else None
+    all_self_attns = () if output_attentions else None
+    next_decoder_cache = None
+
+    for decoder_layer in self.layers:
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        layer_outputs = decoder_layer(
+            hidden_states,
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            seqlens=None,
+            past_key_value=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+
+        hidden_states = layer_outputs[0]
+        if use_cache:
+            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+        if output_attentions:
+            all_self_attns += (layer_outputs[1],)
+
+    hidden_states = self.norm(hidden_states)
+
+    # add hidden states from the last decoder layer
+    if output_hidden_states:
+        all_hidden_states += (hidden_states,)
+
+    next_cache = next_decoder_cache if use_cache else None
+    if not return_dict:
+        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                     if v is not None)
+    return BaseModelOutputWithPast(
+        last_hidden_state=hidden_states,
+        past_key_values=next_cache,
+        hidden_states=all_hidden_states,
+        attentions=all_self_attns,
+    )
+
+
+def eager_attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    seqlens: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
+    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]]=None,
+):
+    invalidInputError(seqlens is None, "`seq_lens` must be None")
+
+    bsz, q_len, _ = hidden_states.size()
+    qkv = self.W_pack(hidden_states)
+    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
+    query_states, key_states, value_states = qkv.split([self.num_heads,
+                                                        self.num_key_value_heads,
+                                                        self.num_key_value_heads], dim=2)
+    # q, k, v: [bsz, seq_len, num_heads, head_dim]
+
+    if past_key_value is None or past_key_value.get_seq_length(self.layer_idx) == 0:  # prefill
+        self.last_k = key_states[:, -1:]
+        self.last_v = value_states[:, -1:]
+
+        key_states = custom_convolution(key_states, self.conv_k)
+        value_states = custom_convolution(value_states, self.conv_v)
+    else:
+        new_key_states = (self.conv_k[0, 0, :, 0, :1] * self.last_k +
+                          self.conv_k[0, 0, :, 0, 1:] * key_states)
+        self.last_k = key_states
+        key_states = new_key_states
+
+        new_value_states = (self.conv_v[0, 0, :, 0, :1] * self.last_v +
+                            self.conv_v[0, 0, :, 0, 1:] * value_states)
+        self.last_v = value_states
+        value_states = new_value_states
+
+    query_states = query_states.transpose(1, 2)
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
+    # q, k, v: [bsz, num_heads, seq_len, head_dim]
+
+    invalidInputError(should_use_fuse_rope(hidden_states, position_ids, self.training),
+                      "fuse rope must be used")
+    import xe_addons
+    xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids,
+                                   query_states, key_states)
+
+    # ignore sliding window
+    key_states, value_states = past_key_value.update(key_states, value_states,
+                                                     self.layer_idx, None)
+    if self.head_dim <= 128:
+        attn_weights = None
+        attn_output = scaled_dot_product_attention(
+            query_states, key_states, value_states,
+            attention_mask, q_len == key_states.size(2)
+        )
+    else:
+        n_rep = self.num_heads // self.num_key_value_heads
+        key_states = repeat_kv(key_states, n_rep)
+        value_states = repeat_kv(value_states, n_rep)
+        attn_weights = torch.matmul(query_states,
+                                    key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+        attn_weights = attention_softmax(attn_weights)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+    return attn_output, attn_weights, past_key_value
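
custom_convolution above applies a short causal convolution along the sequence dimension by left-padding, unfolding a window of width w, and reducing against the kernel. A rough usage sketch, assuming ipex-llm 2.2.0b20250212 is installed; the kernel shape (1, 1, h, 1, w) is an assumption inferred from how conv_k is indexed in eager_attention_forward, not from the docstring:

import torch
from ipex_llm.transformers.models.baichuan_m1 import custom_convolution

bs, seq, h, d, w = 2, 5, 4, 16, 2
U = torch.randn(bs, seq, h, d)    # e.g. per-head key states before caching
K = torch.randn(1, 1, h, 1, w)    # made-up kernel, broadcastable against (bs, seq, h, d, w)
V = custom_convolution(U, K)
print(V.shape)                    # torch.Size([2, 5, 4, 16]); each position mixes w past steps
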
ipex_llm/transformers/models/janus.py ADDED

@@ -0,0 +1,49 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/deepseek-ai/Janus/blob/main/janus/models/siglip_vit.py
+
+import torch
+
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+
+
+def vision_attention_forward(self, x: torch.Tensor) -> torch.Tensor:
+    B, N, C = x.shape
+    qkv = (
+        self.qkv(x)
+        .reshape(B, N, 3, self.num_heads, self.head_dim)
+        .permute(2, 0, 3, 1, 4)
+    )
+    q, k, v = qkv.unbind(0)
+    q, k = self.q_norm(q), self.k_norm(k)
+
+    if self.fused_attn:
+        # ipex-llm opt: sdpa
+        x = scaled_dot_product_attention(
+            q, k.contiguous(), v.contiguous(), None, False
+        )
+    else:
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = attn @ v
+
+    x = x.transpose(1, 2).reshape(B, N, C)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
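
The fused branch above goes through ipex-llm's scaled_dot_product_attention wrapper; the else branch is the reference softmax attention. A standalone sanity check in plain PyTorch (not the ipex-llm wrapper) that the two formulations agree:

import torch
import torch.nn.functional as F

B, H, N, D = 1, 3, 7, 16
q, k, v = (torch.randn(B, H, N, D) for _ in range(3))

fused = F.scaled_dot_product_attention(q, k, v)   # fused kernel, default scale 1/sqrt(D)
attn = (q * D ** -0.5) @ k.transpose(-2, -1)      # manual path, mirroring the else branch
manual = attn.softmax(dim=-1) @ v

print(torch.allclose(fused, manual, atol=1e-5))   # True
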
ipex_llm/transformers/models/utils.py CHANGED

@@ -86,7 +86,7 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor,
         return os.environ["IPEX_LLM_QUANTIZE_KV_CACHE"] == "1"
     elif os.environ.get("IPEX_LLM_LOW_MEM", None) is not None:
         return os.environ["IPEX_LLM_LOW_MEM"] == "1"
-    elif linear.
+    elif linear.weight.dtype != torch.uint8:  # unquantized
         return False
     else:
         device_name = get_xpu_device_name(x.device)
ipex_llm/vllm/xpu/engine/engine.py CHANGED

@@ -13,18 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from
+from vllm.logger import init_logger
+from typing import Dict, Optional, Any, Union, Type
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.llm import LLM
 from vllm.utils import Counter
-from vllm.config import
+from vllm.config import VllmConfig
 from ipex_llm.vllm.xpu.model_convert import _ipex_llm_convert
 from vllm.usage.usage_lib import UsageContext
 from vllm.engine.metrics import StatLoggerBase
 from vllm.engine.multiprocessing.engine import MQLLMEngine
 import signal
+from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig,
+                                   TaskOption)
+from vllm.config import CompilationConfig
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm import envs
+from vllm.v1.engine.async_llm import AsyncLLM
+import os
+
+logger = init_logger(__name__)
 
 
 class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
@@ -35,7 +45,7 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
     def from_engine_args(
         cls,
         engine_args: AsyncEngineArgs,
-        engine_config: Optional[
+        engine_config: Optional[VllmConfig] = None,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
         load_in_low_bit: str = "sym_int4",
@@ -49,6 +59,27 @@ class IPEXLLMAsyncLLMEngine(AsyncLLMEngine):
                                         usage_context=usage_context, stat_loggers=stat_loggers)
 
 
+class IPEXLLMAsyncV1Engine(AsyncLLM):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: AsyncEngineArgs,
+        engine_config: Optional[VllmConfig] = None,
+        start_engine_loop: bool = True,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        load_in_low_bit: str = "sym_int4",
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,  # noqa
+    ) -> "AsyncLLM":
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args=engine_args, engine_config=engine_config,
+                                        start_engine_loop=start_engine_loop,
+                                        usage_context=usage_context, stat_loggers=stat_loggers)
+
+
 class IPEXLLMClass(LLM):
     def __init__(
         self,
@@ -57,6 +88,7 @@ class IPEXLLMClass(LLM):
         tokenizer_mode: str = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
+        allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         quantization: Optional[str] = None,
@@ -64,28 +96,48 @@ class IPEXLLMClass(LLM):
         tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
-        swap_space:
+        swap_space: float = 4,
         cpu_offload_gb: float = 0,
-        enforce_eager: bool =
-        max_context_len_to_capture: Optional[int] = None,
+        enforce_eager: Optional[bool] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
+        disable_async_output_proc: bool = True,
+        hf_overrides: Optional[HfOverrides] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]]=None,
+        # After positional args are removed, move this right below `model`
+        task: TaskOption = "auto",
+        override_pooler_config: Optional[PoolerConfig] = None,
+        compilation_config: Optional[Union[int, Dict[str, Any]]]=None,
         load_in_low_bit: str = "sym_int4",
         **kwargs,
     ) -> None:
+        '''
+        LLM constructor.
+
+        Note: if enforce_eager is unset (enforce_eager is None)
+        it defaults to False.
+        '''
+
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
-
-
-
-
-
+
+        if compilation_config is not None:
+            if isinstance(compilation_config, (int, dict)):
+                compilation_config_instance = CompilationConfig.from_cli(
+                    str(compilation_config))
+            else:
+                compilation_config_instance = compilation_config
+        else:
+            compilation_config_instance = None
+
         engine_args = EngineArgs(
             model=model,
+            task=task,
             tokenizer=tokenizer,
             tokenizer_mode=tokenizer_mode,
             skip_tokenizer_init=skip_tokenizer_init,
             trust_remote_code=trust_remote_code,
+            allowed_local_media_path=allowed_local_media_path,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             quantization=quantization,
@@ -96,16 +148,53 @@ class IPEXLLMClass(LLM):
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
+            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
+            mm_processor_kwargs=mm_processor_kwargs,
+            override_pooler_config=override_pooler_config,
+            compilation_config=compilation_config_instance,
             **kwargs,
         )
-
+        # Logic to switch between engines is done at runtime instead of import
+        # to avoid import order issues
+        self.engine_class = self.get_engine_class()
+        self.llm_engine = self.engine_class.from_engine_args(
            engine_args, usage_context=UsageContext.LLM_CLASS,
            load_in_low_bit=load_in_low_bit)
+
        self.request_counter = Counter()
 
+    @staticmethod
+    def get_engine_class() -> Type[LLMEngine]:
+        if envs.VLLM_USE_V1:
+            return IPEXLLMLLMV1Engine
+        return IPEXLLMLLMEngine
+
+
+class IPEXLLMLLMV1Engine(V1LLMEngine):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]]=None,
+        enable_multiprocessing: bool = False,
+        load_in_low_bit: str = "sym_int4",
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+
+        _ipex_llm_convert(load_in_low_bit)
+        return super().from_engine_args(engine_args,
+                                        usage_context,
+                                        stat_loggers,
+                                        enable_multiprocessing)
+
 
 class IPEXLLMLLMEngine(LLMEngine):
     def __init__(self, *args, **kwargs):
@@ -134,16 +223,24 @@ class IPEXLLMMQLLMEngine(MQLLMEngine):
 
 
 def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
-                  ipc_path: str, load_in_low_bit: str):
+                  ipc_path: str, load_in_low_bit: str, engine_alive):
 
     def signal_handler(*_) -> None:
         # Interrupt server on sigterm
         raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa
 
-
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+
+        engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
+                                                     usage_context=usage_context,
+                                                     ipc_path=ipc_path,
+                                                     load_in_low_bit=load_in_low_bit)
+        engine.start()
+    except BaseException as e:
+        logger.exception(e)
+        engine_alive.value = False
+        raise e  # noqa
 
-
-
-                  ipc_path=ipc_path,
-                  load_in_low_bit=load_in_low_bit)
-    engine.start()
+
+if os.getenv("VLLM_USE_V1"):
+    IPEXLLMAsyncLLMEngine = IPEXLLMAsyncV1Engine