ipex-llm 2.2.0b20250205__py3-none-win_amd64.whl → 2.2.0b20250207__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. ipex_llm/libs/bloom-api.dll +0 -0
  2. ipex_llm/libs/bloom.dll +0 -0
  3. ipex_llm/libs/gptneox-api.dll +0 -0
  4. ipex_llm/libs/gptneox.dll +0 -0
  5. ipex_llm/libs/libbloom_avx.dll +0 -0
  6. ipex_llm/libs/libbloom_vnni.dll +0 -0
  7. ipex_llm/libs/libgptneox_avx.dll +0 -0
  8. ipex_llm/libs/libgptneox_vnni.dll +0 -0
  9. ipex_llm/libs/libllama_avx.dll +0 -0
  10. ipex_llm/libs/libllama_vnni.dll +0 -0
  11. ipex_llm/libs/libstarcoder_avx.dll +0 -0
  12. ipex_llm/libs/libstarcoder_vnni.dll +0 -0
  13. ipex_llm/libs/llama-api.dll +0 -0
  14. ipex_llm/libs/llama.dll +0 -0
  15. ipex_llm/libs/main-bloom.exe +0 -0
  16. ipex_llm/libs/main-gptneox.exe +0 -0
  17. ipex_llm/libs/main-llama.exe +0 -0
  18. ipex_llm/libs/main-starcoder.exe +0 -0
  19. ipex_llm/libs/pipeline.dll +0 -0
  20. ipex_llm/libs/quantize-bloom.exe +0 -0
  21. ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
  22. ipex_llm/libs/quantize-gptneox.exe +0 -0
  23. ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
  24. ipex_llm/libs/quantize-llama.exe +0 -0
  25. ipex_llm/libs/quantize-llama_vnni.exe +0 -0
  26. ipex_llm/libs/quantize-starcoder.exe +0 -0
  27. ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
  28. ipex_llm/libs/starcoder-api.dll +0 -0
  29. ipex_llm/libs/starcoder.dll +0 -0
  30. ipex_llm/transformers/low_bit_linear.py +5 -4
  31. ipex_llm/transformers/model.py +0 -1
  32. ipex_llm/transformers/npu_model.py +17 -5
  33. ipex_llm/transformers/npu_models/convert.py +6 -2
  34. ipex_llm/transformers/npu_models/qwen2_mp.py +20 -8
  35. ipex_llm/transformers/npu_pipeline_model/common.py +161 -0
  36. ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +33 -13
  37. ipex_llm/transformers/npu_pipeline_model/llama.py +20 -159
  38. ipex_llm/transformers/npu_pipeline_model/minicpm.py +19 -10
  39. ipex_llm/transformers/npu_pipeline_model/qwen.py +57 -36
  40. ipex_llm/transformers/qlora.py +2 -2
  41. ipex_llm/transformers/utils.py +19 -6
  42. ipex_llm/transformers/xpu_customize_fwd.py +6 -4
  43. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/METADATA +23 -30
  44. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/RECORD +50 -50
  45. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/ipex-llm-init.bat +0 -0
  46. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-chat.ps1 +0 -0
  47. {ipex_llm-2.2.0b20250205.data → ipex_llm-2.2.0b20250207.data}/scripts/llm-cli.ps1 +0 -0
  48. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/WHEEL +0 -0
  49. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/entry_points.txt +0 -0
  50. {ipex_llm-2.2.0b20250205.dist-info → ipex_llm-2.2.0b20250207.dist-info}/top_level.txt +0 -0
ipex_llm/transformers/npu_pipeline_model/llama.py
@@ -18,112 +18,13 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer
-from intel_npu_acceleration_library.backend.factory import NNFactory
-
-
-class Llama32Embedding(NNFactory):
-    def __init__(
-        self,
-        vocab_size,
-        embedding_dim,
-        embedding_weight,
-        padding_idx,
-        inv_freq,
-        attention_scaling,
-        dtype,  # fp16
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.vocab_size = vocab_size
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.attention_scaling = attention_scaling
-        self.dtype = dtype
-
-        # define input
-        weight = self.constant(embedding_weight)
-        input = self.parameter((1, 1), dtype=np.int32)
-        position_ids = self.parameter((1, 1), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # embed_tokens module
-        if padding_idx == -1:
-            padding_idx += vocab_size
-
-        axis_node = self.constant(np.array([0], dtype=np.int64))
-        if padding_idx is not None:
-            masked_embeddings = np.ones(weight.shape, dtype=np.float16)
-            masked_embeddings[padding_idx, :] = 0.0  # mask
-
-            node_mask = self.constant(masked_embeddings)
-            node_masked_w = self.eltwise_mul(weight, node_mask)
-            res = self.gather(node_masked_w, input, axis_node, 0)
-        else:
-            res = self.gather(weight, input, axis_node, 0)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, 1))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-
-        # define outputs
-        res = self.convert_to_fp16(res)
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
-
-
-class Llama32PostEmbedding(NNFactory):
-    def __init__(
-        self,
-        inv_freq,
-        attention_scaling,
-        input_len: int = 1,
-        device: str = "NPU",
-    ):
-        super().__init__(False, device)
-        self.attention_scaling = attention_scaling
-
-        # define input
-        position_ids = self.parameter((1, input_len), dtype=np.int64)
-        inv_freq = self.constant(inv_freq)
-
-        # rotary_emb module
-        inv_freq = self.reshape(inv_freq, (1, inv_freq.shape[0], 1))
-        position_ids = self.reshape(position_ids, (1, 1, input_len))
-        freqs = self.eltwise_mul(self.convert_to_fp32(inv_freq),
-                                 self.convert_to_fp32(position_ids))
-        freqs = self.transpose(freqs, [0, 2, 1])
-        emb = self.concat(freqs, freqs, axis=2)
-        cos = self.cos(emb)
-        sin = self.sin(emb)
-        cos = cos * self.attention_scaling
-        sin = sin * self.attention_scaling
-        if input_len > 1:
-            cos = self.unsqueeze(cos, [1])
-            sin = self.unsqueeze(sin, [1])
-
-        # define outputs
-        cos = self.convert_to_fp32(cos)
-        sin = self.convert_to_fp32(sin)
-
-        print("start compiling")
-        self.compile()
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_embedding_from_model
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -175,7 +76,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, False)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -195,62 +97,18 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
         weight.tofile(bin_file)
 
-    if hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
-        # llama-2-7B & llama-3-8B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = LLMEmbedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir, True, False)
-    else:
-        # llama-3.2-3B & llama-3.2-1B
-        embedding_layer = model.model.embed_tokens
-        new_embedding = Llama32Embedding(
-            vocab_size=model.config.vocab_size,
-            embedding_dim=model.config.hidden_size,
-            embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-            padding_idx=model.config.pad_token_id,
-            inv_freq=model.model.rotary_emb.inv_freq.to(torch.float16),
-            attention_scaling=model.model.rotary_emb.attention_scaling,
-            dtype=np.float16,
-        )
-        if convert_model:
-            bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-            embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-            first_blob_path = None
-            # save embedding post module
-            inv_freq = model.model.rotary_emb.inv_freq.to(torch.float16)
-            attention_scaling = model.model.rotary_emb.attention_scaling
-            embedding_post = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                  attention_scaling=attention_scaling,
-                                                  input_len=1)
-            update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                               temp_dir, True, False)
-            embedding_post_prefill = Llama32PostEmbedding(inv_freq=inv_freq,
-                                                          attention_scaling=attention_scaling,
-                                                          input_len=max_prompt_len)
-            update_names_of_IR_and_export_blob(embedding_post_prefill,
-                                               "embedding_post_prefill",
-                                               temp_dir, True, False)
-        else:
-            first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                                 temp_dir)
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
+
     return first_blob_path, last_blob_path
 
 
 def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                         temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                        layernorm_const, mode="decode"):
+                        layernorm_const, mode="decode",
+                        keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -317,8 +175,9 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False,
+                                                        keep_ir=keep_ir, compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
@@ -364,7 +223,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                               save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                              layernorm_const, mode="decode"):
+                              layernorm_const, mode="decode",
+                              keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -457,6 +317,7 @@ def convert_fused_llama_layer(model, fused_layers, n_splits_linear, n_splits_dow
        update_names_of_IR_and_export_blob(fused_decoder,
                                           f"decoder_layer_{i}",
                                           save_dir,
-                                          compile_blob=True,
-                                          keep_ir=False)
+                                          keep_ir=keep_ir,
+                                          compile_blob=compile_blob)
+       os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
    return 0
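
The llama.py changes drop the file-local Llama32Embedding/Llama32PostEmbedding graph builders, delegate embedding export to the shared obtain_embedding_from_model helper in npu_pipeline_model/common.py, thread new keep_ir/compile_blob flags through every convert function, and delete the intermediate *.bin after each blob export. Below is a minimal driver sketch of the updated call signatures; the wrapper function and its argument values are illustrative assumptions, while the imported names and keyword parameters come from the diff above.

    # Hypothetical driver, for illustration only: shows how keep_ir/compile_blob
    # now flow through the refactored llama.py entry points.
    from ipex_llm.transformers.npu_pipeline_model.llama import (
        convert_lm_head_and_embedding, convert_llama_layer)

    def export_llama_blobs(model, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, kv_len, group_size,
                           keep_ir=False, compile_blob=True):
        # Embedding export now happens inside this call, via
        # common.obtain_embedding_from_model.
        first_blob, last_blob = convert_lm_head_and_embedding(
            model, n_splits_linear, temp_dir, weight_dir,
            convert_model=True, max_prompt_len=512,
            keep_ir=keep_ir, compile_blob=compile_blob)
        for idx in range(len(model.model.layers)):
            # The temporary decoder .bin is removed after export, so only the
            # compiled blob (and the IR, if keep_ir=True) remains on disk.
            convert_llama_layer(model, idx, n_splits_linear, n_splits_down_proj,
                                temp_dir, weight_dir, transpose_value_cache=True,
                                kv_len=kv_len, group_size=group_size,
                                layernorm_const=True, mode="decode",
                                keep_ir=keep_ir, compile_blob=compile_blob)
        return first_blob, last_blob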
ipex_llm/transformers/npu_pipeline_model/minicpm.py
@@ -162,7 +162,8 @@ class MiniCPMLMHead(LLMBaseNNFactory):
 
 
 def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
-                                  convert_model=False, max_prompt_len=1):
+                                  convert_model=False, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -230,7 +231,8 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
         asym=asym
     )
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, "lm_head", temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if n_splits_linear == 1:
@@ -280,22 +282,27 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
                                              dtype=np.float16,
                                              scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post, "embedding_post",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
         embedding_post_prefill = MiniCPMPostEmbedding(max_prompt_len, model.config.hidden_size,
                                                       dtype=np.float16,
                                                       scale_emb=model.config.scale_emb)
         update_names_of_IR_and_export_blob(embedding_post_prefill,
                                            "embedding_post_prefill",
-                                           temp_dir, True, False)
+                                           temp_dir, keep_ir=keep_ir, compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding_post.bin"))
+        os.remove(os.path.join(temp_dir, "embedding_post_prefill.bin"))
     else:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, "embedding",
-                                                             temp_dir, True, False)
+                                                             temp_dir, keep_ir=keep_ir,
+                                                             compile_blob=compile_blob)
+        os.remove(os.path.join(temp_dir, "embedding.bin"))
     return first_blob_path, last_blob_path
 
 
 def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                           temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                          layernorm_const, mode="decode"):
+                          layernorm_const, mode="decode",
+                          keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -353,7 +360,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, True)
+                                                        keep_ir=keep_ir, compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     if mode == "decode":
         if layernorm_const:
@@ -386,7 +394,8 @@ def convert_minicpm_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                                 save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                                layernorm_const, mode="decode"):
+                                layernorm_const, mode="decode",
+                                keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -477,6 +486,6 @@ def convert_fused_minicpm_layer(model, fused_layers, n_splits_linear, n_splits_d
        update_names_of_IR_and_export_blob(fused_decoder,
                                           f"decoder_layer_{i}",
                                           save_dir,
-                                          compile_blob=True,
-                                          keep_ir=False)
+                                          keep_ir=keep_ir, compile_blob=compile_blob)
+       os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
    return 0
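
minicpm.py applies the same pattern at every export site: forward keep_ir/compile_blob to update_names_of_IR_and_export_blob, then delete the temporary <name>.bin left next to the exported blob. A small sketch of that export-then-cleanup idiom follows; the wrapper name is hypothetical, only the imported function and its keyword arguments appear in the diff.

    import os
    from ipex_llm.transformers.npu_pipeline_model.common import (
        update_names_of_IR_and_export_blob)

    def export_blob_and_cleanup(factory, name, out_dir, keep_ir=False, compile_blob=True):
        # Hypothetical helper mirroring the repeated pattern in this release:
        # export the blob, then drop the intermediate <name>.bin from out_dir.
        blob_path = update_names_of_IR_and_export_blob(factory, name, out_dir,
                                                       keep_ir=keep_ir,
                                                       compile_blob=compile_blob)
        os.remove(os.path.join(out_dir, name + ".bin"))
        return blob_path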
ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -18,13 +18,15 @@
 import torch
 import numpy as np
 import os
-from .common import update_names_of_IR_and_export_blob, LLMEmbedding, LowBitLLMLMHead, \
-    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer
+from .common import update_names_of_IR_and_export_blob, LowBitLLMLMHead, \
+    obtain_weight_from_single_layer, obtain_qkv_bias_from_single_layer, \
+    obtain_embedding_from_model
 from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 
 
 def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
-                                  convert_model=False, group_size=0):
+                                  convert_model=False, group_size=0, max_prompt_len=1,
+                                  keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     head_dim = model.model.layers[0].self_attn.head_dim
     rms_norm_eps = model.config.rms_norm_eps
@@ -84,7 +86,9 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
     )
 
     last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
-                                                        temp_dir, True, False)
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob)
+    os.remove(os.path.join(temp_dir, "lm_head.bin"))
 
     # save weights bins files
     if not isinstance(lm_head, SlicedLMHead):
@@ -104,28 +108,17 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir,
             bin_file = os.path.join(weight_dir, f"model_lm_head_input_{1+idx}.bin")
             weight.tofile(bin_file)
 
-    embedding_layer = model.model.embed_tokens
-    new_embedding = LLMEmbedding(
-        vocab_size=model.config.vocab_size,
-        embedding_dim=model.config.hidden_size,
-        embedding_weight=embedding_layer.weight.to(torch.float16).detach().numpy(),
-        padding_idx=model.config.pad_token_id,
-        dtype=np.float16,
-        input_length=1,
-    )
-    if convert_model:
-        bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
-        embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
-        first_blob_path = True
-    else:
-        first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding",
-                                                             temp_dir, True, keep_ir=True)
+    first_blob_path = obtain_embedding_from_model(model, convert_model,
+                                                  temp_dir, weight_dir,
+                                                  max_prompt_len,
+                                                  keep_ir, compile_blob)
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const, mode="decode"):
+                       layernorm_const, mode="decode",
+                       keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -139,8 +132,13 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     mlp_layer = curr_layer.mlp
     weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
     q_bias, k_bias, v_bias = obtain_qkv_bias_from_single_layer(attn_layer)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+        cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+        cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+    else:
+        # transformers >= 4.45.0
+        cached_cos = None
+        cached_sin = None
     layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
     layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -152,10 +150,12 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     if mode == "decode":
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
+        keep_position_ids = True
         npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
+        keep_position_ids = False
         npu_dpu_groups = 6
 
     single_decoder = LowBitQwenMultiDecoderlayer(
@@ -179,23 +179,38 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         n_splits_linear=n_splits_linear,
         n_splits_down_proj=n_splits_down_proj,
         group_size=group_size,
+        cos_len=input_len,
+        keep_position_ids=keep_position_ids,
         asym=asym
     )
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False,
+                                                        temp_dir, keep_ir=keep_ir,
+                                                        compile_blob=compile_blob,
                                                         npu_dpu_groups=npu_dpu_groups)
+    os.remove(os.path.join(temp_dir, decoder_name + ".bin"))
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
     if mode == "decode":
-        if layernorm_const:
-            st_idx = 3
+        if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+            if layernorm_const:
+                st_idx = 3
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 5
         else:
-            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-            st_idx = 5
+            # transformers >= 4.45.0
+            if layernorm_const:
+                st_idx = 4
+            else:
+                input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+                post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_5.bin")
+                layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+                layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+                st_idx = 6
         q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
         k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
         v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
@@ -226,7 +241,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
 
 def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj,
                              save_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                             layernorm_const, mode="decode"):
+                             layernorm_const, mode="decode",
+                             keep_ir=False, compile_blob=True):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -252,8 +268,13 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
            attn_layer = curr_layer.self_attn
            mlp_layer = curr_layer.mlp
            weights = obtain_weight_from_single_layer(attn_layer, mlp_layer)
-           cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-           cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+           if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
+               cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
+               cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
+           else:
+               # transformers >= 4.45.0
+               cached_cos = None
+               cached_sin = None
            layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
            layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
 
@@ -330,6 +351,6 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
        update_names_of_IR_and_export_blob(fused_decoder,
                                           f"decoder_layer_{i}",
                                           save_dir,
-                                          compile_blob=True,
-                                          keep_ir=False)
+                                          keep_ir=keep_ir, compile_blob=compile_blob)
+       os.remove(os.path.join(save_dir, f"decoder_layer_{i}" + ".bin"))
    return 0
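
qwen.py now probes the rotary embedding before reading cached tables: transformers releases before 4.45 expose rotary_emb.cos_cached/sin_cached, while newer releases compute cos/sin per call, so the exporter passes None and feeds runtime cos/sin (cos_len, keep_position_ids) as graph inputs instead. A sketch of that probe in isolation, with an illustrative function name:

    import torch

    def cached_rotary_tables(decoder_layer):
        # Older transformers (< 4.45.0) keep precomputed cos/sin tables on the
        # rotary embedding; newer versions do not, so the exporter falls back to
        # runtime cos/sin inputs in the decoder graph.
        rotary_emb = decoder_layer.self_attn.rotary_emb
        if hasattr(rotary_emb, "cos_cached"):
            return (rotary_emb.cos_cached.to(torch.float16),
                    rotary_emb.sin_cached.to(torch.float16))
        return None, None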
ipex_llm/transformers/qlora.py
@@ -109,7 +109,7 @@ class LoraLowBitLinear(Module, LoraLayer):
         self.qa_pool = torch.nn.Identity()
 
     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
             # force to use bf16 on gpu
             x = x.to(torch.bfloat16)
@@ -177,7 +177,7 @@ class LoraBF16Linear(Module, LoraLayer):
         self.is_target_conv_1d_layer = is_target_conv_1d_layer
 
     def forward(self, x: torch.Tensor):
-        autocast_dtype = get_autocast_dtype(x)
+        autocast_dtype = get_autocast_dtype(x.device.type)
         if x.device.type == "xpu":
             # force to use bf16 on gpu
             x = x.to(torch.bfloat16)
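
In qlora.py the only change is the call convention of get_autocast_dtype, which now takes a device-type string instead of a tensor. A minimal sketch of the updated usage in a forward path, assuming the same bf16-on-XPU policy as the surrounding code (the helper function here is illustrative):

    import torch
    from ipex_llm.transformers.utils import get_autocast_dtype

    def cast_lora_input(x: torch.Tensor) -> torch.Tensor:
        # New convention: pass x.device.type ("xpu" or "cpu"), not the tensor itself.
        autocast_dtype = get_autocast_dtype(x.device.type)
        if x.device.type == "xpu":
            return x.to(torch.bfloat16)  # bf16 is forced on XPU, as in qlora.py
        return x if autocast_dtype is None else x.to(autocast_dtype)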
ipex_llm/transformers/utils.py
@@ -138,26 +138,39 @@ def fix_key(key):
     return key
 
 
-def get_autocast_dtype(x):
+def is_autocast_enabled(device_type: str):
     if torch.__version__ >= '2.3':
-        if torch.is_autocast_enabled(x.device.type):
-            return torch.get_autocast_dtype(x.device.type)
+        return torch.is_autocast_enabled(device_type)
+    else:
+        if device_type == "xpu":
+            return torch.xpu.is_autocast_xpu_enabled()
+        elif device_type == "cpu":
+            return torch.is_autocast_cpu_enabled()
+        else:
+            invalidInputError(False,
+                              f"Device type {device_type} is not supported.")
+
+
+def get_autocast_dtype(device_type: str):
+    if torch.__version__ >= '2.3':
+        if torch.is_autocast_enabled(device_type):
+            return torch.get_autocast_dtype(device_type)
         else:
             return None
     else:
-        if x.device.type == "xpu":
+        if device_type == "xpu":
             if torch.xpu.is_autocast_xpu_enabled():
                 return torch.xpu.get_autocast_xpu_dtype()
             else:
                 return None
-        elif x.device.type == "cpu":
+        elif device_type == "cpu":
             if torch.is_autocast_cpu_enabled():
                 return torch.get_autocast_cpu_dtype()
             else:
                 return None
         else:
             invalidInputError(False,
-                              f"Device {x.device} is not supported.")
+                              f"Device type {device_type} is not supported.")
 
 
 def get_xpu_device_name(device: torch.device):
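
utils.py splits the old tensor-based helper into is_autocast_enabled and get_autocast_dtype, both keyed on a device-type string: on torch >= 2.3 they call the unified torch.is_autocast_enabled / torch.get_autocast_dtype, otherwise they fall back to the legacy per-device XPU/CPU functions. A short usage sketch under those assumptions:

    import torch
    from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype

    # Inside an autocast region the helpers report the active state and dtype for
    # the given device type; outside it, get_autocast_dtype returns None.
    with torch.autocast("cpu", dtype=torch.bfloat16):
        print(is_autocast_enabled("cpu"))   # True
        print(get_autocast_dtype("cpu"))    # torch.bfloat16
    print(get_autocast_dtype("cpu"))        # None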
ipex_llm/transformers/xpu_customize_fwd.py
@@ -107,6 +107,8 @@ except ModuleNotFoundError:
     np = None  # type: ignore[assignment]
 from typing import Any
 
+from ipex_llm.transformers.utils import is_autocast_enabled, get_autocast_dtype
+
 
 def _cast(value, dtype):
     if isinstance(value, torch.Tensor):
@@ -155,12 +157,12 @@ def custom_fwd(fwd=None, *, cast_inputs=None):
 
     @functools.wraps(fwd)
     def decorate_fwd(*args, **kwargs):
-        args[0]._dtype = torch.xpu.get_autocast_xpu_dtype()
+        args[0]._dtype = get_autocast_dtype("xpu")
         if cast_inputs is None:
-            args[0]._fwd_used_autocast = torch.xpu.is_autocast_xpu_enabled()
+            args[0]._fwd_used_autocast = is_autocast_enabled("xpu")
             return fwd(*args, **kwargs)
         else:
-            autocast_context = torch.xpu.is_autocast_xpu_enabled()
+            autocast_context = is_autocast_enabled("xpu")
             args[0]._fwd_used_autocast = False
             if autocast_context:
                 with torch.xpu.autocast(enabled=False):
@@ -184,7 +186,7 @@ def custom_bwd(bwd):
 
     @functools.wraps(bwd)
     def decorate_bwd(*args, **kwargs):
-        with torch.xpu.autocast(enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
+        with torch.autocast("xpu", enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
            return bwd(*args, **kwargs)
 
    return decorate_bwd
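
xpu_customize_fwd.py now records autocast state through the shared helpers and restores it in the backward pass with the device-agnostic torch.autocast("xpu", ...) context manager rather than the legacy torch.xpu.autocast. A brief sketch of that restore step; the wrapper function is illustrative, and ctx._fwd_used_autocast / ctx._dtype are the attributes recorded by the matching custom_fwd in this file.

    import torch

    def rerun_bwd_under_saved_autocast(bwd, ctx, *grad_outputs):
        # Replays the backward under the same autocast settings that were active
        # during the forward, as recorded by custom_fwd on the ctx object.
        with torch.autocast("xpu", enabled=ctx._fwd_used_autocast, dtype=ctx._dtype):
            return bwd(ctx, *grad_outputs)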