ipex-llm 2.2.0b20250223__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250225__py3-none-manylinux2010_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/transformers/convert.py +14 -1
- ipex_llm/transformers/models/common.py +27 -0
- ipex_llm/transformers/models/deepseek.py +303 -0
- ipex_llm/transformers/models/minicpm3.py +25 -3
- ipex_llm/transformers/xgrammar.py +47 -0
- {ipex_llm-2.2.0b20250223.dist-info → ipex_llm-2.2.0b20250225.dist-info}/METADATA +19 -19
- {ipex_llm-2.2.0b20250223.dist-info → ipex_llm-2.2.0b20250225.dist-info}/RECORD +13 -11
- {ipex_llm-2.2.0b20250223.data → ipex_llm-2.2.0b20250225.data}/scripts/ipex-llm-init +0 -0
- {ipex_llm-2.2.0b20250223.data → ipex_llm-2.2.0b20250225.data}/scripts/llm-chat +0 -0
- {ipex_llm-2.2.0b20250223.data → ipex_llm-2.2.0b20250225.data}/scripts/llm-cli +0 -0
- {ipex_llm-2.2.0b20250223.dist-info → ipex_llm-2.2.0b20250225.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250223.dist-info → ipex_llm-2.2.0b20250225.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250223.dist-info → ipex_llm-2.2.0b20250225.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/convert.py
CHANGED
```diff
@@ -1070,7 +1070,9 @@ def _optimize_pre(model, qtype=None):
         model.apply(pre_register_inv_freq)
     elif model.config.model_type == "multi_modality":
         _optimize_pre(model.language_model)
-
+    elif model.config.model_type == "deepseek_v3" and model.config.hidden_size == 2048:
+        from ipex_llm.transformers.models.deepseek import padding_mla_v_hd
+        model.apply(padding_mla_v_hd)
     return model
 
 
@@ -2023,6 +2025,17 @@ def _optimize_post(model):
 
         # llm
         _optimize_post(model.language_model)
+    elif model.config.model_type == "deepseek_v3" and model.config.hidden_size == 2048:
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)
+        from ipex_llm.transformers.models.common import rms_norm_forward
+        from ipex_llm.transformers.models.deepseek import deepseek_model_forward
+        from ipex_llm.transformers.models.deepseek import deepseek_attention_forward
+        from ipex_llm.transformers.models.deepseek import deepseek_moe_forward
+        convert_forward(model, module.DeepseekV3RMSNorm, rms_norm_forward)
+        convert_forward(model, module.DeepseekV3Model, deepseek_model_forward)
+        convert_forward(model, module.DeepseekV3Attention, deepseek_attention_forward)
+        convert_forward(model, module.DeepseekV3MoE, deepseek_moe_forward)
 
     return model
 
```
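For readers skimming the hunks above: `_optimize_pre` patches weights in place (here, padding the MLA value heads), while `_optimize_post` swaps the `forward` methods of matching module classes for IPEX-LLM's optimized implementations. A minimal sketch of that kind of class-based forward rebinding, assuming only standard PyTorch (the `replace_forward` helper below is hypothetical; the real `convert_forward` is defined elsewhere in convert.py and may differ in details):

```python
import types
import torch

def replace_forward(model: torch.nn.Module, target_class: type, new_forward) -> None:
    # Walk all submodules and rebind `forward` on instances of target_class,
    # so e.g. every DeepseekV3Attention instance starts routing through the
    # patched code without reloading or copying any weights.
    for module in model.modules():
        if isinstance(module, target_class):
            module.forward = types.MethodType(new_forward, module)
```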
ipex_llm/transformers/models/common.py
CHANGED
```diff
@@ -95,6 +95,33 @@ def padding_attention_hd_base(module: torch.nn.Module, attention_class,
     module.old_head_dim = old_head_dim
 
 
+def padding_mla_v_hd_base(module: torch.nn.Module, attention_class):
+    if (
+        isinstance(attention_class, str) and module.__class__.__name__ == attention_class
+        or not isinstance(attention_class, str) and isinstance(module, attention_class)
+    ):
+        k_head_dim = module.q_head_dim
+        v_head_dim = module.v_head_dim
+        if v_head_dim < k_head_dim:
+            kv_b_proj = module.kv_b_proj
+            w = kv_b_proj.weight.data.view(module.num_heads,
+                                           module.qk_nope_head_dim + module.v_head_dim,
+                                           module.kv_lora_rank)
+            k_w, v_w = w.split([module.qk_nope_head_dim, module.v_head_dim], dim=1)
+            new_v_w = torch.zeros([module.num_heads, k_head_dim, module.kv_lora_rank],
+                                  dtype=v_w.dtype, device=v_w.device)
+            new_v_w[:, :v_head_dim, :] = v_w
+            new_w = torch.cat([k_w, new_v_w], dim=1).view(-1, module.kv_lora_rank)
+
+            new_kv_b_proj = torch.nn.Linear(0, 0, bias=False,
+                                            dtype=new_w.dtype, device=new_w.device)
+            new_kv_b_proj.in_features = new_w.size(1)
+            new_kv_b_proj.out_features = new_w.size(0)
+            new_kv_b_proj.weight = torch.nn.Parameter(new_w, False)
+
+            module.kv_b_proj = new_kv_b_proj
+
+
 def padding_states_hd(states: torch.Tensor, old_head_dim: int, new_head_dim: int):
     bsz, num_heads, seq_len, head_dim = states.size()
     if head_dim == old_head_dim and old_head_dim < new_head_dim:
```
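For intuition about what `padding_mla_v_hd_base` does to the fused `kv_b_proj` weight, here is a standalone sketch. The sizes are illustrative, chosen to match the MLA layout where `q_head_dim = qk_nope_head_dim + qk_rope_head_dim` exceeds `v_head_dim`:

```python
import torch

num_heads, kv_lora_rank = 16, 512
qk_nope_head_dim, qk_rope_head_dim, v_head_dim = 128, 64, 128
q_head_dim = qk_nope_head_dim + qk_rope_head_dim  # 192 > v_head_dim (128)

# kv_b_proj maps the rank-512 compressed KV to per-head K-nope and V rows.
w = torch.randn(num_heads * (qk_nope_head_dim + v_head_dim), kv_lora_rank)
w = w.view(num_heads, qk_nope_head_dim + v_head_dim, kv_lora_rank)
k_w, v_w = w.split([qk_nope_head_dim, v_head_dim], dim=1)

# Zero-pad the V rows from v_head_dim up to q_head_dim so that K and V
# share one head dimension and a single fused SDPA call can serve both.
new_v_w = torch.zeros(num_heads, q_head_dim, kv_lora_rank)
new_v_w[:, :v_head_dim, :] = v_w
new_w = torch.cat([k_w, new_v_w], dim=1).view(-1, kv_lora_rank)
print(new_w.shape)  # torch.Size([5120, 512]) == 16 * (128 + 192) rows
```

This is also why `deepseek_attention_forward` below splits the kv projection into `[qk_nope_head_dim, q_head_dim]` and slices `attn_output` back to `self.v_head_dim` after attention: the zero padding buys a uniform head size at the cost of a few dead rows.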
ipex_llm/transformers/models/deepseek.py
ADDED
```diff
@@ -0,0 +1,303 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Some parts of this file is adapted from
+# https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py
+# which is licensed under Apache License 2.0:
+#
+# https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE
+#
+
+import torch
+import warnings
+
+from typing import Optional, Tuple, List, Union
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+
+from ipex_llm.utils.common.log4Error import invalidInputError
+from ipex_llm.transformers.kv import DynamicNormalCache
+from ipex_llm.transformers.models.common import padding_mla_v_hd_base
+from ipex_llm.transformers.models.common import scaled_dot_product_attention
+from ipex_llm.transformers.models.utils import rotate_half
+
+
+def padding_mla_v_hd(module: torch.nn.Module):
+    padding_mla_v_hd_base(module, "DeepseekV3Attention")
+
+
+def deepseek_model_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+) -> Union[Tuple, BaseModelOutputWithPast]:
+    output_attentions = (
+        output_attentions if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+
+    use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+    return_dict = (
+        return_dict if return_dict is not None else self.config.use_return_dict
+    )
+
+    # retrieve input_ids and inputs_embeds
+    invalidInputError((input_ids is None) ^ (inputs_embeds is None),
+                      "You cannot specify both input_ids and inputs_embeds at the same time, "
+                      "and must specify either one")
+
+    if inputs_embeds is None:
+        inputs_embeds = self.embed_tokens(input_ids)
+
+    batch_size, seq_length = inputs_embeds.shape[:2]
+
+    # IPEX-LLM OPT start: kv cache
+    past_key_values_length = 0
+    use_cache = True if inputs_embeds.device.type == "xpu" else use_cache
+    if use_cache:
+        if not isinstance(past_key_values, DynamicNormalCache):
+            past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
+        past_key_values_length = past_key_values.get_usable_length(seq_length)
+    # IPEX-LLM OPT end: kv cache
+
+    if position_ids is None:
+        position_ids = torch.arange(
+            past_key_values_length,
+            seq_length + past_key_values_length,
+            dtype=torch.long,
+            device=inputs_embeds.device,
+        )
+        position_ids = position_ids.unsqueeze(0)
+
+    # IPEX-LLM OPT start: fuse rope
+    if inputs_embeds.device.type == "xpu" and position_ids is not None:
+        cos, sin = self.layers[0].self_attn.rotary_emb(inputs_embeds,
+                                                       seq_length + past_key_values_length)
+        cos = cos[position_ids[0]].contiguous()
+        sin = sin[position_ids[0]].contiguous()
+        position_embeddings = (cos, sin)
+    else:
+        position_embeddings = None
+    # IPEX-LLM OPT end: fuse rope
+
+    # 4d mask is passed through the layers
+    attention_mask = _prepare_4d_causal_attention_mask(
+        attention_mask,
+        (batch_size, seq_length),
+        inputs_embeds,
+        past_key_values_length,
+    )
+
+    # embed positions
+    hidden_states = inputs_embeds
+
+    # decoder layers
+    all_hidden_states = () if output_hidden_states else None
+    all_self_attns = () if output_attentions else None
+    next_decoder_cache = None
+
+    for decoder_layer in self.layers:
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        layer_outputs = decoder_layer(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+        )
+
+        hidden_states = layer_outputs[0]
+
+        if use_cache:
+            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+        if output_attentions:
+            all_self_attns += (layer_outputs[1],)
+
+    hidden_states = self.norm(hidden_states)
+
+    # add hidden states from the last decoder layer
+    if output_hidden_states:
+        all_hidden_states += (hidden_states,)
+
+    next_cache = next_decoder_cache
+    if not return_dict:
+        return tuple(
+            v
+            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+            if v is not None
+        )
+    return BaseModelOutputWithPast(
+        last_hidden_state=hidden_states,
+        past_key_values=next_cache,
+        hidden_states=all_hidden_states,
+        attentions=all_self_attns,
+    )
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+    b, h, s, d = q.shape
+    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+    b, h, s, d = k.shape
+    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def deepseek_attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    if "padding_mask" in kwargs:
+        warnings.warn(
+            "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+            "Please make sure use `attention_mask` instead.`"
+        )
+
+    bsz, q_len, _ = hidden_states.size()
+
+    if self.q_lora_rank is None:
+        q = self.q_proj(hidden_states)
+    else:
+        q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+    q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
+
+    compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+    compressed_kv, k_pe = torch.split(
+        compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+    )
+    k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
+    kv = (
+        self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
+        .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.q_head_dim)
+        .transpose(1, 2)
+    )
+
+    k_nope, value_states = torch.split(
+        kv, [self.qk_nope_head_dim, self.q_head_dim], dim=-1
+    )
+    kv_seq_len = value_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+    position_embeddings = kwargs.get("position_embeddings", None)
+    if position_embeddings is not None:
+        query_states = q
+        key_states = torch.cat(
+            [k_nope, k_pe.expand([-1, self.num_heads, -1, -1])],
+            dim=-1
+        )
+        import xe_addons
+        cos, sin = position_embeddings
+        xe_addons.rotary_two_with_cache_inplaced(query_states[:, :, :, self.qk_nope_head_dim:],
+                                                 key_states[:, :, :, self.qk_nope_head_dim:],
+                                                 cos, sin, True)
+    else:
+        q_nope, q_pe = torch.split(
+            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
+        )
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
+
+        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
+        query_states[:, :, :, self.qk_nope_head_dim:] = q_pe
+
+        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
+        key_states[:, :, :, self.qk_nope_head_dim:] = k_pe
+
+    if past_key_value is not None:
+        key_states, value_states = past_key_value.update(key_states, value_states,
+                                                         self.layer_idx, None)
+
+    attn_weights = None
+    attn_output = scaled_dot_product_attention(
+        query_states, key_states, value_states,
+        attention_mask, q_len == kv_seq_len, self.softmax_scale
+    )
+    attn_output = attn_output[:, :, :, :self.v_head_dim]
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
+def moe_infer_decode(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor):
+    idxs = topk_ids.flatten().tolist()
+    outputs = []
+    for i in idxs:
+        expert = self.experts[i]
+        expert_out = expert(x)
+        outputs.append(expert_out)
+    outs = torch.cat(outputs, dim=0)
+    reshaped_topk_weight = topk_weight.squeeze(0).unsqueeze(-1).to(outs.dtype)
+    final_out = (outs * reshaped_topk_weight).sum(dim=0, keepdim=True)
+    return final_out
+
+
+def deepseek_moe_forward(self, hidden_states: torch.Tensor):
+    identity = hidden_states
+    orig_shape = hidden_states.shape
+    topk_idx, topk_weight = self.gate(hidden_states)
+    hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+    flat_topk_idx = topk_idx.view(-1)
+    if not self.training:
+        # IPEX-LLM OPT start : add special moe_infer implementation for decoding
+        if topk_idx.size(0) == 1:
+            y = moe_infer_decode(self, hidden_states, topk_idx, topk_weight)
+        else:
+            y = self.moe_infer(hidden_states, topk_idx, topk_weight)
+        y = y.view(*orig_shape)
+        # IPEX-LLM OPT end
+    if self.config.n_shared_experts is not None:
+        y = y + self.shared_experts(identity)
+    return y
```
ipex_llm/transformers/models/minicpm3.py
CHANGED
```diff
@@ -1,3 +1,25 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Some parts of this file is adapted from
+# https://hf-mirror.com/openbmb/MiniCPM3-4B/blob/main/modeling_minicpm.py
+# which is licensed under Apache License 2.0:
+#
+# https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE
+#
+
 import torch
 import warnings
 
@@ -122,9 +144,6 @@ def minicpm3_attention_forward(
 
     q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
     q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
-    q_nope, q_pe = torch.split(
-        q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
-    )
 
     compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
     compressed_kv, k_pe = torch.split(
@@ -169,6 +188,9 @@ def minicpm3_attention_forward(
         else:
             invalidInputError(f"unknown rope method: {self.rotary_emb.__class__.__name__}")
     else:
+        q_nope, q_pe = torch.split(
+            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
+        )
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
 
```
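Net effect of the two code hunks above: the `q_nope`/`q_pe` split is deferred from the top of the attention forward into the unfused-rope fallback branch, the only place the two halves are needed as separate tensors (the fused XPU path rotates the rope slice of `q` in place instead). The split itself is an ordinary last-dimension `torch.split`; a quick illustration with made-up MiniCPM3-like sizes:

```python
import torch

bsz, num_heads, q_len = 1, 40, 16
qk_nope_head_dim, qk_rope_head_dim = 64, 32  # illustrative, not from the config
q = torch.randn(bsz, num_heads, q_len, qk_nope_head_dim + qk_rope_head_dim)

# Separate the rotary ("pe") slice from the non-rotary ("nope") slice.
q_nope, q_pe = torch.split(q, [qk_nope_head_dim, qk_rope_head_dim], dim=-1)
print(q_nope.shape, q_pe.shape)  # [1, 40, 16, 64] and [1, 40, 16, 32]
```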
ipex_llm/transformers/xgrammar.py
ADDED
```diff
@@ -0,0 +1,47 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from transformers import PreTrainedTokenizer, LogitsProcessor
+from ipex_llm.utils.modules import insert_fake_module
+
+insert_fake_module("xgrammar.kernels.apply_token_bitmask_inplace_cuda")
+insert_fake_module("xgrammar.kernels.apply_token_bitmask_inplace_triton")
+insert_fake_module(
+    "xgrammar.kernels.apply_token_bitmask_inplace_cuda.apply_token_bitmask_inplace_cuda"
+)
+insert_fake_module(
+    "xgrammar.kernels.apply_token_bitmask_inplace_triton.apply_token_bitmask_inplace_triton"
+)
+
+import xgrammar as xgr
+
+
+def create_json_logits_processor(tokenizer: PreTrainedTokenizer, vocab_size: int, schema=None):
+    tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size)
+    grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
+    if schema is None:
+        compiled_grammar = grammar_compiler.compile_builtin_json_grammar()
+    else:
+        compiled_grammar = grammar_compiler.compile_json_schema(schema)
+    processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)
+    return processor
+
+
+def reset_json_logits_processor(processor: LogitsProcessor) -> LogitsProcessor:
+    compiled_grammar = processor.compiled_grammar
+    new_processor = xgr.contrib.hf.LogitsProcessor(compiled_grammar)
+    return new_processor
```
{ipex_llm-2.2.0b20250223.dist-info → ipex_llm-2.2.0b20250225.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ipex-llm
-Version: 2.2.0b20250223
+Version: 2.2.0b20250225
 Summary: Large Language Model Develop Toolkit
 Home-page: https://github.com/intel-analytics/ipex-llm
 Author: BigDL Authors
@@ -27,7 +27,7 @@ Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'all'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'all'
 Provides-Extra: cpp
-Requires-Dist: bigdl-core-cpp ==2.6.0b20250223 ; extra == 'cpp'
+Requires-Dist: bigdl-core-cpp ==2.6.0b20250225 ; extra == 'cpp'
 Requires-Dist: setuptools ; extra == 'cpp'
 Requires-Dist: onednn-devel ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
 Requires-Dist: onednn ==2025.0.1 ; (platform_system == "Windows") and extra == 'cpp'
@@ -60,7 +60,7 @@ Requires-Dist: transformers ==4.40.0 ; extra == 'npu'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'npu'
 Requires-Dist: torch ==2.1.2+cpu ; (platform_system == "Linux") and extra == 'npu'
 Requires-Dist: torch ==2.1.2 ; (platform_system == "Windows") and extra == 'npu'
-Requires-Dist: bigdl-core-npu ==2.6.0b20250223 ; (platform_system == "Windows") and extra == 'npu'
+Requires-Dist: bigdl-core-npu ==2.6.0b20250225 ; (platform_system == "Windows") and extra == 'npu'
 Provides-Extra: serving
 Requires-Dist: py-cpuinfo ; extra == 'serving'
 Requires-Dist: fschat[model_worker,webui] ==0.2.36 ; extra == 'serving'
@@ -80,9 +80,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250223 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250223 ; extra == 'xpu'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250223 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250225 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250225 ; extra == 'xpu'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250225 ; extra == 'xpu'
 Provides-Extra: xpu-2-1
 Requires-Dist: py-cpuinfo ; extra == 'xpu-2-1'
 Requires-Dist: protobuf ; extra == 'xpu-2-1'
@@ -97,9 +97,9 @@ Requires-Dist: setuptools <70.0.0 ; extra == 'xpu-2-1'
 Requires-Dist: torch ==2.1.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: torchvision ==0.16.0a0 ; extra == 'xpu-2-1'
 Requires-Dist: intel-extension-for-pytorch ==2.1.10+xpu ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250223 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250223 ; extra == 'xpu-2-1'
-Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250223 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-21 ==2.6.0b20250225 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-batch-21 ==2.6.0b20250225 ; extra == 'xpu-2-1'
+Requires-Dist: bigdl-core-xe-addons-21 ==2.6.0b20250225 ; extra == 'xpu-2-1'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-2-1'
 Requires-Dist: dpcpp-cpp-rt ==2024.0.2 ; (platform_system == "Windows") and extra == 'xpu-2-1'
 Requires-Dist: mkl-dpcpp ==2024.0.0 ; (platform_system == "Windows") and extra == 'xpu-2-1'
@@ -117,7 +117,7 @@ Requires-Dist: setuptools ; extra == 'xpu-2-6'
 Requires-Dist: torch ==2.6.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchvision ==0.21.0+xpu ; extra == 'xpu-2-6'
 Requires-Dist: torchaudio ==2.6.0+xpu ; extra == 'xpu-2-6'
-Requires-Dist: bigdl-core-xe-all ==2.6.0b20250223 ; extra == 'xpu-2-6'
+Requires-Dist: bigdl-core-xe-all ==2.6.0b20250225 ; extra == 'xpu-2-6'
 Requires-Dist: onednn-devel ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: onednn ==2025.0.1 ; extra == 'xpu-2-6'
 Requires-Dist: dpcpp-cpp-rt ==2025.0.2 ; extra == 'xpu-2-6'
@@ -133,9 +133,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arc'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arc'
 Requires-Dist: tabulate ; extra == 'xpu-arc'
 Requires-Dist: setuptools ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250223 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250223 ; extra == 'xpu-arc'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250223 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250225 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250225 ; extra == 'xpu-arc'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250225 ; extra == 'xpu-arc'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arc'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arc'
@@ -156,9 +156,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-arl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-arl'
 Requires-Dist: tabulate ; extra == 'xpu-arl'
 Requires-Dist: setuptools ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250223 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250223 ; extra == 'xpu-arl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250223 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250225 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250225 ; extra == 'xpu-arl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250225 ; extra == 'xpu-arl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-arl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-arl'
@@ -179,9 +179,9 @@ Requires-Dist: tokenizers ==0.15.2 ; extra == 'xpu-lnl'
 Requires-Dist: accelerate ==0.23.0 ; extra == 'xpu-lnl'
 Requires-Dist: tabulate ; extra == 'xpu-lnl'
 Requires-Dist: setuptools ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250223 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250223 ; extra == 'xpu-lnl'
-Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250223 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-23 ==2.6.0b20250225 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-batch-23 ==2.6.0b20250225 ; extra == 'xpu-lnl'
+Requires-Dist: bigdl-core-xe-addons-23 ==2.6.0b20250225 ; extra == 'xpu-lnl'
 Requires-Dist: intel-openmp ; (platform_machine == "x86_64" or platform_machine == "AMD64") and extra == 'xpu-lnl'
 Requires-Dist: torch ==2.3.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
 Requires-Dist: torchvision ==0.18.1+cxx11.abi ; (platform_system == "Linux") and extra == 'xpu-lnl'
```
{ipex_llm-2.2.0b20250223.dist-info → ipex_llm-2.2.0b20250225.dist-info}/RECORD
CHANGED
```diff
@@ -94,7 +94,7 @@ ipex_llm/serving/fastchat/tgi_api_protocol.py,sha256=brT3k3-V0NJrU4fRqUwWjC0O3iO
 ipex_llm/serving/fastchat/tgi_api_server.py,sha256=agNTAEiZPSuj3dEdIdYKwkoY0cXOUDX06DiM9VP2knQ,24418
 ipex_llm/serving/fastchat/vllm_worker.py,sha256=ZLz2Q9GxJO6r_LOiP6epgCRjBGk-K4EB1SNEWSJp5DA,11091
 ipex_llm/transformers/__init__.py,sha256=BreA3EY6hkNq0rVixb_sUuTLzMrcWXTt3yRsshCPHHQ,1214
-ipex_llm/transformers/convert.py,sha256
+ipex_llm/transformers/convert.py,sha256=294hk2uMQPN0DrPIpqhfgFvR7klvlHKS53DUUhoWaeU,103273
 ipex_llm/transformers/convert_ipex.py,sha256=_nSnUTQy-yfkKaqGdqnBdWztZf3NGmnbZ0TKaDrF4X4,14617
 ipex_llm/transformers/embedding.py,sha256=bdgk59DvD4ZZyxRzewXOR7g56nThgO6uhIwk8QL7f-s,9299
 ipex_llm/transformers/kv.py,sha256=k4TU18LlA-Sbq9WNNQnfuzu3RSFBwFhmaV3BcGN5bAo,19191
@@ -113,6 +113,7 @@ ipex_llm/transformers/speculative.py,sha256=0XNLgc9dGswJHVPrXo4iM7pPxkWwfFfJMECc
 ipex_llm/transformers/streamer.py,sha256=RrVlLblzCOtABRUpaMXAyaMnCGgLUtAi_YesLumRbww,4842
 ipex_llm/transformers/training_patch.py,sha256=oxMkUtqyvqJiprw6dE3skkYfD1HOmUlH9N0hBkbn0G0,10799
 ipex_llm/transformers/utils.py,sha256=a-2wbflSd_yYnC5qcMoY5HLR1yT_QpxeX_WpGpaDLrA,17457
+ipex_llm/transformers/xgrammar.py,sha256=dd-e0DO0s-t-idngVzncnPAa_Gxb8YNoRJ3WROkwrs8,1840
 ipex_llm/transformers/xpu_customize_fwd.py,sha256=PUBYLnTbaBXUs3Dnte9Gqln2XFk8iA62SmloWjr7GJI,7668
 ipex_llm/transformers/xpu_ops.py,sha256=z95iTtcDQvNyJOvB4A6B_ECTYjHp4A7x-FsssoETOMs,4914
 ipex_llm/transformers/awq/__init__.py,sha256=Du5gu3-eeAkeDO_dEMBTzrDBA66DSN3uL3-rn8WGXQw,875
@@ -152,8 +153,9 @@ ipex_llm/transformers/models/chatglm.py,sha256=UHai1t2AUtGmF765_eHF8LUMVQzp_oCBx
 ipex_llm/transformers/models/chatglm2.py,sha256=KyAIX7zGVQDQuwwM3QMBNWZbTeMHEzKUIgAryT0voHc,14933
 ipex_llm/transformers/models/chatglm4.py,sha256=QvUehdaCePB3MNHyWg3dneDxmjtBdxYeKUyQUVcsgfM,16886
 ipex_llm/transformers/models/chatglm4v.py,sha256=L6y45M_wjS2_HqchmCUxRlQZUNuSNCGOiynAQrGh918,14124
-ipex_llm/transformers/models/common.py,sha256=
+ipex_llm/transformers/models/common.py,sha256=0OTRaXekOPApRdQ8UKl5Du8DOtKJ6awnQIStvYvFQOI,13018
 ipex_llm/transformers/models/decilm.py,sha256=P-PBuDPf07GvKggLwJx_wPwIn6esN3rX8ai2JxRuZmE,5246
+ipex_llm/transformers/models/deepseek.py,sha256=2w2bWbbuYi__fPs56vE9Wq5bdiZCF2NkYJNXf-b9LjQ,11130
 ipex_llm/transformers/models/deepseek_v3.py,sha256=CTgwIKQlUPlUCbOxc9Id5GapWkXOP6pMtkguYrWpCio,10003
 ipex_llm/transformers/models/gemma.py,sha256=_E3Yw8Y45xyNVeLqyVKcpr8kjuICtETeL82cJ-bWJuU,9424
 ipex_llm/transformers/models/gemma2.py,sha256=2WZuv-FLzJyTJFaYxOuzJt47QE64M0lHnzAiO5T6ozI,8049
@@ -166,7 +168,7 @@ ipex_llm/transformers/models/internvl.py,sha256=Vx0vENIEQLX2M6P398mw5TOhpks0U8xf
 ipex_llm/transformers/models/janus.py,sha256=0URo2NC8_2CGaOl3CiVB3IFTVsYyplMFgjBJdPDNBsY,1509
 ipex_llm/transformers/models/llama.py,sha256=rqrNjuZb_jeb9MKx0z-FSVoGx8YDBxQzPJ9ZUvYhgx0,9138
 ipex_llm/transformers/models/minicpm.py,sha256=eaPNVNrep0_xGoELhZd886ff0ceoKqB6cusdAhd52eE,10145
-ipex_llm/transformers/models/minicpm3.py,sha256=
+ipex_llm/transformers/models/minicpm3.py,sha256=37P_yMjw8RIzy27qL_E7kzbQRNW6f0xYQNK9xtoe5kI,10183
 ipex_llm/transformers/models/minicpmv.py,sha256=PP05b5iTnrMpiseCn8iJcxKJDnfq7WqXp9Mrch0kKZ0,9876
 ipex_llm/transformers/models/mistral.py,sha256=uVhkdXaq15v1P3QY0emVsA7SxUbAWChHEEXYN-drjpQ,7449
 ipex_llm/transformers/models/mllama.py,sha256=ZyRq9DTKsvk1AlRbr-z6ngjS3Sr_7YuGZ6-Yr1MBBAM,10937
@@ -260,11 +262,11 @@ ipex_llm/vllm/xpu/engine/__init__.py,sha256=pY_CpyuZd72fr6s32ejeKHKFW0K4vUU2rzZj
 ipex_llm/vllm/xpu/engine/engine.py,sha256=NvCMbp0X8NVrOqbwm4FTvXOptTRLzu9jQsy37ZHnTk8,9493
 ipex_llm/vllm/xpu/entrypoints/openai/api_server.py,sha256=D577nxWlyoWaHXNXIEvS3ViKSSWL3XZq8D8t6izD7x4,33250
 ipex_llm/vllm/xpu/entrypoints/openai/cli_args.py,sha256=hB398yYtKauASRzevctScdbFIjiiSGMAe1bwEuIHrhY,10893
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
-ipex_llm-2.2.
+ipex_llm-2.2.0b20250225.data/scripts/ipex-llm-init,sha256=fLQsT2dRL6H5bThb4GuIWotAuqoLsIxFwA-0c2qmaO8,6672
+ipex_llm-2.2.0b20250225.data/scripts/llm-chat,sha256=TdUnUmNapzuoe1c8IzrdVOQwWEg8IqsMSBRlOD3daZM,2249
+ipex_llm-2.2.0b20250225.data/scripts/llm-cli,sha256=RXGPlLElHxcKzoUxljEMBIAXbzCDysXL-Nxw-xF-7LU,2457
+ipex_llm-2.2.0b20250225.dist-info/METADATA,sha256=3_EbaWHFTUZ3JXASEqCh8-KfRdJ-s0TRsdOk6L2-Fyo,12369
+ipex_llm-2.2.0b20250225.dist-info/WHEEL,sha256=PPJcBMAZibF_2GFE9NmOJGqiaSMPiNFbJd6QaJjdA6Y,109
+ipex_llm-2.2.0b20250225.dist-info/entry_points.txt,sha256=TiUyBB2MRmfF3ko-pyAEzqeBCRnyhu27bNOAsWPp3e8,61
+ipex_llm-2.2.0b20250225.dist-info/top_level.txt,sha256=CGCMHM-SyqUabU4h8RqJ2KTYckQUO3LvIWwmUQ6Qbzw,9
+ipex_llm-2.2.0b20250225.dist-info/RECORD,,
```