ipex-llm 2.2.0b20250218__py3-none-manylinux2010_x86_64.whl → 2.2.0b20250220__py3-none-manylinux2010_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ipex_llm/optimize.py CHANGED
@@ -170,9 +170,11 @@ def load_low_bit(model, model_path):
     invalidInputError(isinstance(model, torch.nn.Module),
                       "model should be an instance of `torch.nn.Module`, "
                       f"but got {type(model)} at last.")
-    invalidInputError(model.device.type in ('cpu', 'meta'),
-                      "Expect model on device `cpu` or `meta`, "
-                      f"but got device type {model.device.type}")
+    if hasattr(model, "device"):
+        # vLLM do not have device for model
+        invalidInputError(model.device.type in ('cpu', 'meta'),
+                          "Expect model on device `cpu` or `meta`, "
+                          f"but got device type {model.device.type}")
     qtype = ggml_tensor_qtype[low_bit]
     model = ggml_convert_low_bit(model, qtype=qtype, convert_shape_only=True)
 
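The new guard only runs the device check when the model object actually exposes a `device` attribute; vLLM wraps models in objects without one, so those now pass straight through to quantization. A minimal sketch of the behaviour, using a hypothetical `DummyVllmModel` that is not part of ipex-llm:

import torch

class DummyVllmModel(torch.nn.Module):
    # Hypothetical stand-in: like a vLLM model wrapper, it defines no `device` attribute.
    pass

model = DummyVllmModel()
if hasattr(model, "device"):
    # Only reached for models (e.g. Hugging Face PreTrainedModel) that expose .device
    assert model.device.type in ("cpu", "meta")
# For DummyVllmModel the check is skipped and load_low_bit proceeds to quantization.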
@@ -15,7 +15,7 @@
 #
 
 
-from .convert import ggml_convert_low_bit, get_enable_ipex
+from .convert import ggml_convert_low_bit, get_enable_ipex, convert_model_hybrid
 from .model import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM, \
     AutoModelForSpeechSeq2Seq, AutoModelForQuestionAnswering, \
     AutoModelForSequenceClassification, AutoModelForMaskedLM, \
@@ -2025,3 +2025,30 @@ def _optimize_post(model):
         _optimize_post(model.language_model)
 
     return model
+
+
+def convert_forward_to_xpu(m, target_m, new_forward):
+    # print(m.__class__.__name__)
+    if m.__class__ == target_m:
+        bound_method = new_forward.__get__(m, m.__class__)
+        setattr(m, "forward", bound_method)
+        m = m.to(device="xpu", dtype=torch.float16)
+    for _, sub_m in m.named_children():
+        convert_forward_to_xpu(sub_m, target_m, new_forward)
+
+
+def convert_model_hybrid(model):
+    if model.config.model_type == "deepseek_v3":
+        modeling_module_name = model.__class__.__module__
+        module = importlib.import_module(modeling_module_name)
+        from ipex_llm.transformers.models.deepseek_v3 import (
+            hybrid_DeepseekV3Attention_forward,
+            hybrid_DeepseekV3MLP_forward,
+        )
+
+        first_k_dense_replace = model.config.first_k_dense_replace
+        convert_forward_to_xpu(model, module.DeepseekV3Attention,
+                               hybrid_DeepseekV3Attention_forward)
+        convert_forward_to_xpu(model.model.layers[:first_k_dense_replace], module.DeepseekV3MLP,
+                               hybrid_DeepseekV3MLP_forward)
+    return model
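A hedged usage sketch of the new hybrid path. It assumes the import hunk above belongs to `ipex_llm/transformers/__init__.py` (the diff does not name that file) and uses an illustrative checkpoint id and loading arguments that are not part of this diff:

import torch
from ipex_llm.transformers import AutoModelForCausalLM, convert_model_hybrid

# Illustrative: a DeepSeek-V3 checkpoint loaded on CPU with remote modeling code,
# so that module.DeepseekV3Attention / module.DeepseekV3MLP can be resolved.
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-V3",        # hypothetical checkpoint id for this sketch
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Swap DeepseekV3Attention everywhere, and DeepseekV3MLP in the first
# config.first_k_dense_replace (dense) layers, for the XPU fp16 hybrid forwards.
model = convert_model_hybrid(model)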
@@ -230,7 +230,7 @@ def scaled_dot_product_attention(query: torch.Tensor, key: torch.Tensor,
     if (
         device.type == "xpu"
         and dtype in [torch.float, torch.half]
-        and head_dim in [64, 80, 96, 128]
+        and head_dim in [64, 80, 96, 128, 192, 256]
     ):
         # prepare scale
         scale = 1 / math.sqrt(head_dim) if scale is None else scale
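The two new sizes line up with MLA-style attention heads, where the query/key head dimension is the non-rotary part plus the rotary part. A quick check with DeepSeek-V3's published head sizes (taken from the public model config, not from this diff):

# Assumed DeepSeek-V3 config values (public model config, not part of this diff)
qk_nope_head_dim = 128
qk_rope_head_dim = 64

q_head_dim = qk_nope_head_dim + qk_rope_head_dim   # 192, the query/key head size
assert q_head_dim in [64, 80, 96, 128, 192, 256]   # now hits the XPU SDPA fast path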
@@ -0,0 +1,233 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Some parts of this file is adapted from
+# https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/llama/modeling_llama.py
+# which is licensed under Apache License 2.0:
+#
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Tuple, Union
+import warnings
+
+import torch
+from torch import nn
+from transformers.cache_utils import Cache
+
+from ipex_llm.transformers.models.utils import apply_rotary_pos_emb
+from ipex_llm.utils.common import invalidInputError
+
+
+def hybrid_DeepseekV3MLP_forward(self, x):
+    x = x.to(device="xpu", dtype=torch.float16)
+    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+    return down_proj.to(device="cpu", dtype=torch.bfloat16)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.  # noqa
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+    b, h, s, d = q.shape
+    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+    b, h, s, d = k.shape
+    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+# Copied from modeling_deepseek.DeepseekV3Attention
+def DeepseekV3Attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    if "padding_mask" in kwargs:
+        warnings.warn(
+            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"  # noqa
+        )
+    bsz, q_len, _ = hidden_states.size()
+
+    if self.q_lora_rank is None:
+        q = self.q_proj(hidden_states)
+    else:
+        q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+    q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
+    q_nope, q_pe = torch.split(
+        q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
+    )
+
+    compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+    compressed_kv, k_pe = torch.split(
+        compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+    )
+    k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
+    kv = (
+        self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
+        .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+        .transpose(1, 2)
+    )
+
+    k_nope, value_states = torch.split(
+        kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
+    )
+    kv_seq_len = value_states.shape[-2]
+    if past_key_value is not None:
+        if self.layer_idx is None:
+            raise ValueError(  # noqa
+                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "  # noqa
+                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "  # noqa
+                "with a layer index."
+            )
+        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+    q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
+
+    query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+    query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
+    query_states[:, :, :, self.qk_nope_head_dim:] = q_pe
+
+    key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+    key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
+    key_states[:, :, :, self.qk_nope_head_dim:] = k_pe
+    if past_key_value is not None:
+        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+        key_states, value_states = past_key_value.update(
+            key_states, value_states, self.layer_idx, cache_kwargs
+        )
+
+    attn_weights = (
+        torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
+    )
+
+    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+        raise ValueError(  # noqa
+            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"  # noqa
+            f" {attn_weights.size()}"
+        )
+    assert attention_mask is not None  # noqa
+    if attention_mask is not None:
+        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            raise ValueError(  # noqa
+                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"  # noqa
+            )
+        attn_weights = attn_weights + attention_mask
+
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(
+        attn_weights, dim=-1, dtype=torch.float32
+    ).to(query_states.dtype)
+    attn_weights = nn.functional.dropout(
+        attn_weights, p=self.attention_dropout, training=self.training
+    )
+    attn_output = torch.matmul(attn_weights, value_states)
+
+    if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
+        raise ValueError(  # noqa
+            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"  # noqa
+            f" {attn_output.size()}"
+        )
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+
+    attn_output = self.o_proj(attn_output)
+
+    if not output_attentions:
+        attn_weights = None
+
+    return attn_output, attn_weights, past_key_value
+
+
+def hybrid_DeepseekV3Attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    xpu_device: str = "xpu",
+    **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    hidden_states = hidden_states.to(device="xpu", dtype=torch.float16)
+    attention_mask = attention_mask.to(device="xpu", dtype=torch.float16)
+    position_ids = position_ids.to(device="xpu")
+    if past_key_value is not None:
+        past_key_value = past_key_value.to(device="xpu", dtype=torch.float16)
+
+    attn_output, attn_weights, past_key_value = DeepseekV3Attention_forward(
+        self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs  # noqa
+    )
+
+    if attn_output is not None:
+        attn_output = attn_output.to(device="cpu", dtype=torch.bfloat16)
+    if attn_weights is not None:
+        attn_weights = attn_weights.to(device="cpu", dtype=torch.bfloat16)
+    if past_key_value is not None:
+        past_key_value = past_key_value.to(device="cpu", dtype=torch.bfloat16)
+    torch.xpu.empty_cache()
+
+    return attn_output, attn_weights, past_key_value
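The hybrid forwards above keep module weights and the KV cache on CPU in bfloat16 and only stage activations on the XPU in float16 for the heavy matmuls, copying results back afterwards. A self-contained sketch of that round-trip pattern, independent of the DeepSeek modules and assuming an XPU-enabled PyTorch build is available:

import torch


def hybrid_linear_forward(linear: torch.nn.Linear, x: torch.Tensor) -> torch.Tensor:
    # Stage the CPU bf16 activation on the XPU in fp16, run the op there,
    # then hand the result back to the CPU pipeline in bf16.
    x_xpu = x.to(device="xpu", dtype=torch.float16)
    y_xpu = linear(x_xpu)          # `linear` is assumed to already live on XPU in fp16
    return y_xpu.to(device="cpu", dtype=torch.bfloat16)


if torch.xpu.is_available():       # requires an XPU build of PyTorch / IPEX
    proj = torch.nn.Linear(1024, 1024).to(device="xpu", dtype=torch.float16)
    hidden = torch.randn(1, 16, 1024, dtype=torch.bfloat16)      # CPU activations
    out = hybrid_linear_forward(proj, hidden)
    torch.xpu.empty_cache()        # same cleanup as hybrid_DeepseekV3Attention_forward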